#
# This script converts a given pdf file into a set of
# text files
#

# showUsage
#
# Prints the command line help of this script to standard output.
# Takes no arguments and reads no global variables.
#
# NOTE(review): the tuples written in -secondo mode also contain a
# <file>...</file---> element between the page number and the text; that
# element is not described in the help text - confirm the attribute name and
# type before adding it here.
showUsage()
{
  # A quoted here-document keeps the text literal (no expansion) and avoids
  # one echo call per line.
  cat <<'EOF'
Usage: extractPages [-secondo] <inputfile>
  <inputfile> must be a pdf file

Without the -secondo option a new directory
  is created and all pages of the input file
  are written as ascii text into separate files
  additionally, double pages are also written into
  this directory

Using the -secondo option single pages and double
  pages are written as tuples of a secondo relation
  formatted as follows
    Filename  : string = the used filename
    Doublepage: bool   = false if single page true if double page
    Page      : int    = the (first) extracted page
    Content   : text   = the content of the extracted page(s)
EOF
}

# ---------------------------------------------------------------------------
# Command line handling
#
# Sets the following variables used by the rest of the script:
#   GS      - name of the Ghostscript executable
#   SECONDO - "true" if the -secondo option was given, "false" otherwise
#   INFILE  - the pdf file to process
#
# All expansions are quoted so that file names containing blanks or glob
# characters are handled, and so that an unset SECONDO_PLATFORM does not
# make the test fail with "unary operator expected".
# ---------------------------------------------------------------------------

# At least one argument (the option or the input file) is required.
if [ -z "${1:-}" ]; then
  echo "missing filename" >&2
  # stdout carries the relation data in -secondo mode, so help shown because
  # of an error goes to stderr
  showUsage >&2
  exit 1
fi

# On the win32 platform the console version of Ghostscript has another name.
GS=gs
if [ "${SECONDO_PLATFORM:-}" = "win32" ]; then
  GS=gswin32c.exe
fi

# Optional leading -secondo switch; it is removed from the argument list.
SECONDO="false"
if [ "$1" = "-secondo" ]; then
  SECONDO="true"
  shift
fi

# After the optional switch the input file must follow.
if [ -z "${1:-}" ]; then
  echo "missing filename" >&2
  showUsage >&2
  exit 1
fi

INFILE=$1

if [ ! -f "$INFILE" ]; then
  echo "file not found " >&2
  exit 1
fi

# ---------------------------------------------------------------------------
# Output preparation
#
# Derives the output names from INFILE, creates the output directory,
# converts the pdf into a PostScript file inside that directory and
# determines the number of pages.
#
# Sets: SIMPLEFILE, DIR, BASEFILE, PSFILE, PAGENUMBER
# ---------------------------------------------------------------------------

# file name without directory part and without the .pdf extension
SIMPLEFILE=$(basename "$INFILE" .pdf)

# all results are collected in <name>_pages
DIR="${SIMPLEFILE}_pages"

# never overwrite existing data
if [ -e "$DIR" ]; then
  echo "file  $DIR  exists - please remove it before running this script" >&2
  exit 1
fi

if ! mkdir "$DIR"; then
  echo "cannot create directory $DIR" >&2
  exit 1
fi

BASEFILE="${DIR}/${SIMPLEFILE}"
PSFILE="${BASEFILE}.ps"

# pdf2ps writes the PostScript version to stdout; tee stores it as PSFILE.
# psselect reports "Wrote <n> pages" on stderr, so its stderr is routed into
# the pipe while its stdout (the selected pages) is discarded. sed prints
# only the number of the report line.
PAGENUMBER=$(pdf2ps "$INFILE" - \
  | tee "$PSFILE" \
  | psselect -p1- 2>&1 >/dev/null \
  | sed -n 's/.*Wrote *\([0-9][0-9]*\).*/\1/p')

# Stop here if one of the tools failed and no page count could be read;
# otherwise the page loops below would silently do nothing.
case "$PAGENUMBER" in
  '' | *[!0-9]*)
    echo "cannot determine the number of pages of $INFILE" >&2
    exit 1
    ;;
esac

# ---------------------------------------------------------------------------
# extract single pages
#
# Reads:  PAGENUMBER, SECONDO, INFILE, BASEFILE, PSFILE, GS
# Writes: PAGE, FILENAME
#
# Without -secondo every page is written as text to
#   <BASEFILE>_P_<page>.txt
# With -secondo every page is stored as pdf in <BASEFILE>_P_<page>.pdf and
# one tuple per page is printed to stdout.
# ---------------------------------------------------------------------------

PAGE=1
while [ "$PAGE" -le "${PAGENUMBER:-0}" ]; do
  if [ "$SECONDO" != "true" ]; then
    pdftotext -f "$PAGE" -l "$PAGE" -enc Latin1 -nopgbrk \
      "$INFILE" "${BASEFILE}_P_${PAGE}.txt"
  else
    FILENAME="${BASEFILE}_P_${PAGE}.pdf"
    # extract the single page as pdf
    psselect -p"$PAGE" "$PSFILE" \
      | "$GS" -sDEVICE=pdfwrite -dNOPAUSE -dBATCH -sOutputFile="$FILENAME" -q -
    # printf is used instead of echo so that backslashes in file names are
    # printed unchanged and no non-portable "echo -n" is required
    printf '%s\n' "("                          # begin tuple
    printf '%s\n' " \"$INFILE\" "              # name of the file
    printf '%s\n' " FALSE "                    # not a double page
    printf '%s\n' " $PAGE "                    # the page number
    printf '%s\n' " <file>$FILENAME</file--->" # the extracted pdf page
    printf '%s' "<text>"                       # text follows without newline
    pdftotext -f "$PAGE" -l "$PAGE" -enc Latin1 -nopgbrk "$INFILE" -
    printf '%s\n' "</text---> )"               # end of text and of tuple
  fi
  PAGE=$((PAGE + 1))
done

# ---------------------------------------------------------------------------
# extract double pages
#
# Reads:  PAGENUMBER, SECONDO, INFILE, BASEFILE, PSFILE, GS
# Writes: PAGE, PAGE2, FILENAME
#
# A double page consists of the pages <page> and <page>+1. Without -secondo
# the text is written to <BASEFILE>_DP_<page>_<page+1>.txt. With -secondo
# the two pages are stored as pdf in <BASEFILE>_DP_<page>_<page+1>.pdf and
# one tuple per double page is printed to stdout.
#
# The loop stops before the last page: the last page has no successor, so a
# "double page" starting there would refer to a page that does not exist.
# ---------------------------------------------------------------------------

PAGE=1
while [ "$PAGE" -lt "${PAGENUMBER:-0}" ]; do
  PAGE2=$((PAGE + 1))
  if [ "$SECONDO" != "true" ]; then
    pdftotext -f "$PAGE" -l "$PAGE2" -enc Latin1 -nopgbrk \
      "$INFILE" "${BASEFILE}_DP_${PAGE}_${PAGE2}.txt"
  else
    FILENAME="${BASEFILE}_DP_${PAGE}_${PAGE2}.pdf"
    # extract both pages as one pdf
    psselect -p"${PAGE}-${PAGE2}" "$PSFILE" \
      | "$GS" -sDEVICE=pdfwrite -dNOPAUSE -dBATCH -sOutputFile="$FILENAME" -q -
    # printf is used instead of echo so that backslashes in file names are
    # printed unchanged and no non-portable "echo -n" is required
    printf '%s\n' "("                          # begin tuple
    printf '%s\n' " \"$INFILE\" "              # name of the file
    printf '%s\n' " TRUE "                     # this is a double page
    printf '%s\n' " $PAGE "                    # the first page of the pair
    printf '%s\n' " <file>$FILENAME</file--->" # the extracted pdf pages
    printf '%s' "<text>"                       # text follows without newline
    pdftotext -f "$PAGE" -l "$PAGE2" -enc Latin1 -nopgbrk "$INFILE" -
    printf '%s\n' "</text---> )"               # end of text and of tuple
  fi
  PAGE=$((PAGE + 1))
done