Files

127 lines
3.1 KiB
Plaintext
Raw Permalink Normal View History

2026-01-23 17:03:45 +08:00
#
# This script converts a given PDF file into a set of
# text files
#
#
# showUsage
#
# Prints the help text of this script to standard output, one
# line per printf argument. Takes no arguments.
#
showUsage()
{
  printf '%s\n' \
    'Usage extractPages [-secondo] <inputfile>' \
    ' <inputfile> must be an pdf file ' \
    '' \
    'Without the -secondo option a new directory' \
    ' ist created and all pages of the input file' \
    ' are written as ascii text into separate files' \
    ' additionally, double pages are also written into' \
    ' this directory' \
    '' \
    'Using the -secondo option single pages and double' \
    ' pages are written as tuples of a secondo relation ' \
    ' formatted as follow ' \
    ' Filename : string = the used filename ' \
    ' Doublepage: bool = false if single page true if double page' \
    ' Page: int = the (first) exracted page ' \
    ' Content : text = the content of the extracted page(s)'
}
# ---------------------------------------------------------------
# Command line handling
#
# Accepted invocations:
#   extractPages <inputfile>
#   extractPages -secondo <inputfile>
#
# Sets:
#   GS       command used further below to write PDF files
#            (gs, or gswin32c.exe if SECONDO_PLATFORM is win32)
#   SECONDO  "true" if -secondo was given, "false" otherwise
#
# When this section is left, $1 is the name of the input file.
# ---------------------------------------------------------------

# At least one argument is required. The expansion is quoted so
# that an argument containing blanks does not break the test.
if [ -z "${1:-}" ]; then
  echo "missing filename" >&2
  showUsage
  exit 1
fi

# SECONDO_PLATFORM may be unset; expand it with an empty default
# and quote it so that the comparison stays well-formed.
GS=gs
if [ "${SECONDO_PLATFORM:-}" = "win32" ]; then
  GS=gswin32c.exe
fi

# Optional switch: consume it so that the file name moves to $1.
SECONDO="false"
if [ "$1" = "-secondo" ]; then
  SECONDO="true"
  shift
fi

# After removing the switch a file name must still be present.
if [ -z "${1:-}" ]; then
  echo "missing filename" >&2
  showUsage
  exit 1
fi
# ---------------------------------------------------------------
# Input check and output directory
#
# Sets:
#   INFILE      the input file as given on the command line
#   SIMPLEFILE  base name of INFILE without a trailing ".pdf"
#   DIR         output directory "<SIMPLEFILE>_pages" (created here,
#               relative to the current working directory)
#   BASEFILE    common prefix "<DIR>/<SIMPLEFILE>" of all output files
#   PSFILE      "<BASEFILE>.ps"
#
# All expansions are quoted so that paths containing blanks or
# glob characters are handled as a single word.
# ---------------------------------------------------------------
INFILE=$1
if [ ! -f "$INFILE" ]; then
  echo "file not found " >&2
  exit 1
fi

SIMPLEFILE=$(basename "$INFILE" .pdf)
DIR="${SIMPLEFILE}_pages"

# Refuse to touch anything that already exists under that name.
if [ -e "$DIR" ]; then
  echo "file " "$DIR" " exists - please remove it before running this script" >&2
  exit 1
fi

# Stop here if the directory cannot be created; every later step
# writes into it.
if ! mkdir "$DIR"; then
  echo "cannot create directory " "$DIR" >&2
  exit 1
fi

BASEFILE="${DIR}/${SIMPLEFILE}"
PSFILE="${BASEFILE}.ps"
# ---------------------------------------------------------------
# Convert the input to PostScript and determine the page count.
#
# pdf2ps writes the PostScript to stdout; tee stores a copy in
# PSFILE and passes it on to psselect. The regular output of
# psselect is discarded and only its stderr is piped on; from the
# line containing "Wrote" the number that follows is taken.
# NOTE(review): presumably this number is the page count reported
# by psselect - confirm against the installed psselect version.
# ---------------------------------------------------------------
PAGENUMBER=$(pdf2ps "$INFILE" - \
  | tee "$PSFILE" \
  | psselect -p1- 2>&1 >/dev/null \
  | grep "Wrote" \
  | sed "s/\(.*Wrote *\)\([0-9]*\).*/\2/g")

# The loops below compare against PAGENUMBER numerically. Fail with
# a clear message instead of a "[: integer expression expected"
# error if one of the tools above failed or printed something else.
case "$PAGENUMBER" in
  '' | *[!0-9]*)
    echo "cannot determine the number of pages of " "$INFILE" >&2
    exit 1
    ;;
esac
# ---------------------------------------------------------------
# extract single pages
#
# For every page 1..PAGENUMBER:
#  - without -secondo: write the page text to
#    "<BASEFILE>_P_<page>.txt"
#  - with -secondo: write the page as "<BASEFILE>_P_<page>.pdf"
#    and print one tuple (file name, FALSE, page, <file>, <text>)
#    to stdout
# ---------------------------------------------------------------
PAGE=1
# An empty PAGENUMBER is treated as 0 so that the test stays valid.
while [ "$PAGE" -le "${PAGENUMBER:-0}" ]; do
  if [ "$SECONDO" != "true" ]; then
    pdftotext -f "$PAGE" -l "$PAGE" -enc Latin1 -nopgbrk \
      "$INFILE" "${BASEFILE}_P_${PAGE}.txt"
  else
    FILENAME="${BASEFILE}_P_${PAGE}.pdf"
    # extract the single page as pdf
    psselect -p"$PAGE" "$PSFILE" \
      | "$GS" -sDEVICE=pdfwrite -dNOPAUSE -dBATCH \
        -sOutputFile="$FILENAME" -q -
    echo "("                           # begin tuple
    echo " \"$INFILE\" "               # name of the file
    echo " FALSE "                     # not a double page
    echo " $PAGE "                     # the page number
    echo " <file>$FILENAME</file--->"  # the extracted pdf page
    # printf instead of "echo -n": no newline between the opening
    # tag and the text, independent of the echo implementation
    printf '%s' "<text>"
    pdftotext -f "$PAGE" -l "$PAGE" -enc Latin1 -nopgbrk "$INFILE" -
    echo "</text---> )"                # end of text and tuple
  fi
  PAGE=$((PAGE + 1))
done
# ---------------------------------------------------------------
# extract double pages
#
# For every page 1..PAGENUMBER the range <page>..<page + 1> is
# processed:
#  - without -secondo: write the text of both pages to
#    "<BASEFILE>_DP_<page>_<page + 1>.txt"
#  - with -secondo: write both pages as
#    "<BASEFILE>_DP_<page>_<page + 1>.pdf" and print one tuple
#    (file name, TRUE, first page, <file>, <text>) to stdout
#
# NOTE(review): in the last iteration PAGE2 is PAGENUMBER + 1, so
# the final range names a page beyond the counted pages. Confirm
# whether the loop should stop at PAGENUMBER - 1; the original
# bound is kept here so that the produced output does not change.
# ---------------------------------------------------------------
PAGE=1
# An empty PAGENUMBER is treated as 0 so that the test stays valid.
while [ "$PAGE" -le "${PAGENUMBER:-0}" ]; do
  PAGE2=$((PAGE + 1))
  if [ "$SECONDO" != "true" ]; then
    # options first, then input and output file, as in the other
    # pdftotext calls of this script
    pdftotext -f "$PAGE" -l "$PAGE2" -enc Latin1 -nopgbrk \
      "$INFILE" "${BASEFILE}_DP_${PAGE}_${PAGE2}.txt"
  else
    FILENAME="${BASEFILE}_DP_${PAGE}_${PAGE2}.pdf"
    # extract both pages as one pdf
    psselect -p"${PAGE}-${PAGE2}" "$PSFILE" \
      | "$GS" -sDEVICE=pdfwrite -dNOPAUSE -dBATCH \
        -sOutputFile="$FILENAME" -q -
    echo "("                           # begin tuple
    echo " \"$INFILE\" "               # name of the file
    echo " TRUE "                      # this is a double page
    echo " $PAGE "                     # the first page of the pair
    echo " <file>$FILENAME</file--->"  # the extracted pdf pages
    # printf instead of "echo -n": no newline between the opening
    # tag and the text, independent of the echo implementation
    printf '%s' "<text>"
    pdftotext -f "$PAGE" -l "$PAGE2" -enc Latin1 -nopgbrk "$INFILE" -
    echo "</text---> )"                # end of text and tuple
  fi
  PAGE=$((PAGE + 1))
done