secondo/Tools/Generators/TextRelations/extractPages

#
# This script converts a given PDF file into a set of
# text files (single pages and double pages)
#

# Print the help text of extractPages to stdout.
# Takes no arguments. Each argument of the printf below is one output line
# and is emitted verbatim.
showUsage()
{
printf '%s\n' \
  'Usage extractPages [-secondo] <inputfile>' \
  ' <inputfile> must be an pdf file ' \
  '' \
  'Without the -secondo option a new directory' \
  '  ist created and all pages of the input file' \
  '  are written as ascii text into separate files' \
  '  additionally, double pages are also written into' \
  '  this directory' \
  '' \
  'Using the -secondo option single pages and double' \
  '  pages are written as tuples of a secondo relation ' \
  ' formatted as follow ' \
  '   Filename  : string = the used filename ' \
  '   Doublepage: bool = false if single page true if double page' \
  '   Page: int = the (first) exracted page ' \
  '   Content : text = the content of the extracted page(s)'
}

# Abort with the usage text when the script is called without any argument.
# "$1" is quoted so that an argument containing blanks is seen by the test
# as a single word instead of causing a syntax error in [ ... ].
if [ -z "$1" ]; then
  echo "missing filename" >&2
  showUsage
  exit 1
fi


# Name of the Ghostscript executable used to write the extracted pages
# as PDF. On the win32 platform the console binary has a different name.
GS=gs

# ${SECONDO_PLATFORM:-} plus quoting keeps the test well-formed when the
# variable is unset; "=" is the portable string comparison for [ ... ].
if [ "${SECONDO_PLATFORM:-}" = "win32" ]; then
  GS=gswin32c.exe
fi

# Output mode: "true" selects the secondo tuple output, "false" (default)
# the plain text files.
SECONDO="false"

# An optional leading -secondo switches the mode and is removed from the
# argument list, so the input file is always found in $1 afterwards.
# case is used instead of [ ... == ... ] because "==" is not a POSIX test
# operator.
case "$1" in
  -secondo)
    SECONDO="true"
    shift
    ;;
esac

# After the optional -secondo has been shifted away, an input file name
# must still be present. "$1" is quoted so that names containing blanks do
# not break the test.
if [ -z "$1" ]; then
  echo "missing filename" >&2
  showUsage
  exit 1
fi

# The first remaining argument is the input file to process.
INFILE=$1

# Stop if the input is not an existing regular file. The expansion is
# quoted so that paths containing blanks are tested as one word.
if [ ! -f "$INFILE" ]; then
   echo "file not found " >&2
   exit 1
fi

# Input file name without its directory part and without a trailing ".pdf".
SIMPLEFILE=$(basename "$INFILE" .pdf)

# Directory (relative to the current directory) receiving all generated files.
DIR="${SIMPLEFILE}_pages"


# Never write into an already existing file or directory.
if [ -e "$DIR" ]; then
    echo "file  $DIR  exists - please remove it before running this script" >&2
    exit 1;
fi

# Without the output directory nothing below can work, so stop on failure
# (mkdir prints its own error message).
mkdir "$DIR" || exit 1

# Common path prefix of all generated files, and the PostScript copy of
# the input that psselect reads from when single pages are cut out.
BASEFILE="${DIR}/${SIMPLEFILE}"
PSFILE="${BASEFILE}.ps"

# Convert the input to PostScript, keep a copy in $PSFILE (tee) and feed it
# to psselect. psselect's regular output is discarded; only its diagnostic
# stream is kept, and the number following "Wrote" in it is taken as the
# number of pages.
PAGENUMBER=$(pdf2ps "$INFILE" - | tee "$PSFILE" | psselect -p1-  2>&1 >/dev/null | grep "Wrote" | sed "s/\(.*Wrote *\)\([0-9]*\).*/\2/g")

# If one of the tools failed, the result is empty or not a plain number.
# Stop here instead of continuing with page loops that cannot run.
case "$PAGENUMBER" in
  ''|*[!0-9]*)
    echo "cannot determine the number of pages of $INFILE" >&2
    exit 1
    ;;
esac

# extract single pages
#
# For every page from 1 to PAGENUMBER:
#   plain mode   - the text of the page is written to <BASEFILE>_P_<page>.txt
#   secondo mode - the page is written as <BASEFILE>_P_<page>.pdf and one
#                  tuple (file name, FALSE, page number, <file>, <text>) is
#                  printed to stdout
# All expansions are quoted so that file names containing blanks work.

PAGE=1
while [ "$PAGE" -le "$PAGENUMBER" ]; do
  if [ "$SECONDO" != "true" ]; then
      pdftotext -f "$PAGE" -l "$PAGE" -enc Latin1 -nopgbrk "$INFILE" "${BASEFILE}_P_${PAGE}.txt"
  else
      FILENAME="${BASEFILE}_P_${PAGE}.pdf"
      # extract the single page as pdf
      psselect -p"$PAGE" "$PSFILE" | "$GS" -sDEVICE=pdfwrite -dNOPAUSE -dBATCH -sOutputFile="$FILENAME" -q -
      echo "(" # begin tuple
      echo " \"$INFILE\" " # name of the file
      echo " FALSE "       # not a double page
      echo " $PAGE "       # the page number
      echo " <file>$FILENAME</file--->"
      # no newline here: the page text follows directly after the opening tag
      printf '%s' "<text>"
      pdftotext -f "$PAGE" -l "$PAGE" -enc Latin1 -nopgbrk "$INFILE" -
      echo  "</text---> )"
  fi
  PAGE=$((PAGE + 1))
done


# extract double pages
#
# A double page consists of page PAGE and its successor PAGE2 = PAGE + 1.
# The loop therefore stops before the last page: the last page has no
# successor, and a pair (PAGENUMBER, PAGENUMBER + 1) would name a page that
# does not exist.
#   plain mode   - the text of both pages is written to
#                  <BASEFILE>_DP_<page>_<page2>.txt
#   secondo mode - both pages are written as <BASEFILE>_DP_<page>_<page2>.pdf
#                  and one tuple (file name, TRUE, first page number, <file>,
#                  <text>) is printed to stdout
# All expansions are quoted so that file names containing blanks work.

PAGE=1
while [ "$PAGE" -lt "$PAGENUMBER" ]; do
  PAGE2=$((PAGE + 1))
  if [ "$SECONDO" != "true" ]; then
      pdftotext -f "$PAGE" -l "$PAGE2" -enc Latin1 -nopgbrk "$INFILE" "${BASEFILE}_DP_${PAGE}_${PAGE2}.txt"
  else
      FILENAME="${BASEFILE}_DP_${PAGE}_${PAGE2}.pdf"
      # extract both pages as one pdf
      psselect -p"${PAGE}-${PAGE2}" "$PSFILE" | "$GS" -sDEVICE=pdfwrite -dNOPAUSE -dBATCH -sOutputFile="$FILENAME" -q -
      echo "(" # begin tuple
      echo " \"$INFILE\" " # name of the file
      echo " TRUE "        # a double page
      echo " $PAGE "       # the first page of the double page
      echo " <file>$FILENAME</file--->"
      # no newline here: the page text follows directly after the opening tag
      printf '%s' "<text>"
      pdftotext -f "$PAGE" -l "$PAGE2" -enc Latin1 -nopgbrk "$INFILE" -
      echo  "</text---> )"
  fi
  PAGE=$((PAGE + 1))
done