#!/bin/bash
#
# This script converts a given pdf file into a set of
# textfiles
#
# Usage: extractPages [-secondo] <file>
#
# A directory "<name>_pages" (where <name> is the input file name without
# its ".pdf" suffix) is created in the current working directory. The
# script refuses to run if that directory already exists.
#
# Without -secondo, every single page and every pair of consecutive pages
# is written as a text file into that directory.
# With -secondo, every single page and every pair of consecutive pages is
# written as a pdf file into that directory, and one tuple per extracted
# range is printed to stdout.
#
# External tools used: pdf2ps, psselect, pdftotext and ghostscript.
# Environment: SECONDO_PLATFORM - if it equals "win32", gswin32c.exe is used
#              as ghostscript executable instead of gs.
#

# Prints the usage text to stdout.
showUsage() {
  printf '%s\n' \
    "Usage extractPages [-secondo] <file>" \
    "<file> must be an pdf file " \
    "" \
    "Without the -secondo option a new directory" \
    " ist created and all pages of the input file" \
    " are written as ascii text into separate files" \
    " additionally, double pages are also written into" \
    " this directory" \
    "" \
    "Using the -secondo option single pages and double" \
    " pages are written as tuples of a secondo relation " \
    " formatted as follow " \
    " Filename : string = the used filename " \
    " Doublepage: bool = false if single page true if double page" \
    " Page: int = the (first) exracted page " \
    " Content : text = the content of the extracted page(s)"
}

# Extracts the pages FIRST..LAST of the input file.
#
# Globals:   SECONDO, INFILE, PSFILE, GS (all read)
# Arguments: $1 - first page of the range
#            $2 - last page of the range (equal to $1 for a single page)
#            $3 - output path without extension
#            $4 - value printed as second tuple element (FALSE or TRUE)
# Outputs:   plain mode:   writes "$3.txt"
#            secondo mode: writes "$3.pdf" and prints one tuple to stdout
extractRange() {
  local first=$1 last=$2 stem=$3 isDouble=$4

  # psselect gets "-pN" for a single page and "-pN-M" for a range.
  local range=$first
  if (( last > first )); then
    range="$first-$last"
  fi

  if [[ "$SECONDO" != "true" ]]; then
    pdftotext -f "$first" -l "$last" -enc Latin1 -nopgbrk "$INFILE" "$stem.txt"
    return
  fi

  # extract the page range as pdf
  local pdfFile="$stem.pdf"
  psselect "-p$range" "$PSFILE" \
    | "$GS" -sDEVICE=pdfwrite -dNOPAUSE -dBATCH "-sOutputFile=$pdfFile" -q -

  printf '%s\n' "("               # begin tuple
  printf ' "%s" \n' "$INFILE"     # name of the input file
  printf ' %s \n' "$isDouble"     # FALSE: single page, TRUE: double page
  printf ' %s \n' "$first"        # the (first) extracted page
  printf ' %s\n' "$pdfFile"       # pdf file holding the extracted page(s)
  # NOTE(review): the original ran `echo -n ""` (prints nothing) at this
  # point and closes the tuple with a bare " )". Presumably delimiters for
  # the text value were intended around the content - confirm against the
  # expected tuple syntax before relying on this output.
  pdftotext -f "$first" -l "$last" -enc Latin1 -nopgbrk "$INFILE" -
  printf '%s\n' " )"              # end tuple
}

# --- argument handling -------------------------------------------------

SECONDO="false"
if [[ "${1:-}" == "-secondo" ]]; then
  SECONDO="true"
  shift
fi

if [[ -z "${1:-}" ]]; then
  echo "missing filename" >&2
  showUsage
  exit 1
fi

# SECONDO_PLATFORM may be unset; treat that like any non-win32 platform.
GS=gs
if [[ "${SECONDO_PLATFORM:-}" == "win32" ]]; then
  GS=gswin32c.exe
fi

INFILE=$1
if [[ ! -f "$INFILE" ]]; then
  echo "file not found " >&2
  exit 1
fi

# --- output directory --------------------------------------------------

SIMPLEFILE=$(basename "$INFILE" .pdf)
DIR="${SIMPLEFILE}_pages"

if [[ -e "$DIR" ]]; then
  echo "file " "$DIR" " exists - please remove it before running this script" >&2
  exit 1
fi

mkdir "$DIR" || exit 1

BASEFILE="$DIR/$SIMPLEFILE"
PSFILE="$BASEFILE.ps"

# --- page count --------------------------------------------------------

# Convert the pdf to postscript (kept in $PSFILE for the page extraction
# below) and let psselect process all pages; the page count is taken from
# the "Wrote <n> pages" message that psselect prints on stderr.
PAGENUMBER=$(pdf2ps "$INFILE" - \
  | tee "$PSFILE" \
  | psselect -p1- 2>&1 >/dev/null \
  | sed -n 's/.*Wrote *\([0-9][0-9]*\).*/\1/p')

# Without a numeric page count the loops below cannot work; fail loudly
# instead of silently producing nothing.
if [[ ! "$PAGENUMBER" =~ ^[0-9]+$ ]]; then
  echo "could not determine the number of pages of $INFILE" >&2
  exit 1
fi

# --- extract single pages ----------------------------------------------

for (( PAGE = 1; PAGE <= PAGENUMBER; PAGE++ )); do
  extractRange "$PAGE" "$PAGE" "${BASEFILE}_P_${PAGE}" FALSE
done

# --- extract double pages ----------------------------------------------

# A double page consists of PAGE and PAGE + 1, so the last valid start is
# PAGENUMBER - 1; starting at the last page would request a page that does
# not exist.
for (( PAGE = 1; PAGE < PAGENUMBER; PAGE++ )); do
  PAGE2=$(( PAGE + 1 ))
  extractRange "$PAGE" "$PAGE2" "${BASEFILE}_DP_${PAGE}_${PAGE2}" TRUE
done