Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Both sides previous revision Previous revision
ingesting:ingbscript [2012/11/27 10:28]
giancarlo
ingesting:ingbscript [2012/11/27 11:02] (current)
giancarlo
Line 47: Line 47:
 <WRAP prewrap center> <WRAP prewrap center>
 <code bash ingbookepages.sh.sh> <code bash ingbookepages.sh.sh>
 +#!/bin/bash
 +
 +FEDORA_HOME="/usr/local/fedora"
 +export FEDORA_HOME
 +
 +PATH=$PATH:$FEDORA_HOME/server/bin:$FEDORA_HOME/client/bin:$JAVA_HOME/bin:/bin:/usr/bin:/sbin:/usr/sbin
 +export PATH
 +
 +#parameters
 +
 +#1 book directory
 +#e.g. IMPORT_BASE_DIR="/srv/storage/scansioni/archivio/fgramsci_TO024-00001"
 +IMPORT_BASE_DIR=$1
 +
 +#2 Collection
 +#e.g. COLLEZIONE="openbess:cognetti-C001"
 +COLLEZIONE=$2
 +
 +#3 fedoraAdmin password or test
 +
 +#4 number of pages to ingest or 0 for all
 +
 +IMPORT_VOL_DIR=${IMPORT_BASE_DIR##*/}
 +
 +#### pre-ingesting check ####
 +#############################
 +
 +bookPID=${IMPORT_VOL_DIR/"_"/":"}
 +imagedir="$IMPORT_BASE_DIR""/"
 +echo "============================================================================="
 +echo "Book PID: $bookPID"
 +echo "dir: $imagedir"
 +echo "-----------------------------------------------------------------------------"
 +
 +#check files TIFF
 +
 +SAVEIFS=$IFS
 +IFS=$(echo -en "\n\b")
 +ntiff=0
 +for nfile in $(find "$imagedir"*.tif -type f);
 +do
 +   let "ntiff += 1"
 +   sn=$(printf "%04d" $ntiff)
 +   if [[ "$nfile" != *$sn.tif ]]
 +   then
 +      echo "ERROR file $nfile non corrisponde a $sn"
 +      exit
 +   fi
 +done
 +IFS=$SAVEIFS
 +
 +#check only one OCR directory
 +
 +n=0
 +for ndir in $(find "$imagedir"* -type d);
 +do
 +   let "n += 1"
 +done
 +
 +if [ $n -gt 1 ] || [ $n -lt 1 ]
 +then
 +   echo "ERROR dir TXT non unica"
 +   exit
 +fi
 +
 +ocrdir=$(find "$imagedir"* -type d)
 +
 +#check files TXT
 +
 +ntxt=0
 +SAVEIFS=$IFS
 +IFS=$(echo -en "\n\b")
 +for nfile in $(find "$ocrdir"/*.txt -type f);
 +do
 +   let "ntxt += 1"
 +   st=$(printf "%04d" $ntxt)
 +   if [[ "$nfile" != *$st.txt ]]
 +   then
 +      echo "ERROR file $nfile non corrisponde a $st"
 +      exit
 +   fi
 +done
 +IFS=$SAVEIFS
 +
 +#check same number of files TIFF and TXT
 +
 +if [ $ntiff -ne $ntxt ]
 +then
 +   echo "ERROR files $ntiff TIFF non corrisponde a $ntxt TXT"
 +   exit
 +fi
 +
 +#check PDF
 +
 +n=0
 +SAVEIFS=$IFS
 +IFS=$(echo -en "\n\b")
 +for nfile in $(find "$imagedir"*.pdf -type f);
 +do
 +   let "n += 1"
 +   filepdf="$nfile"
 +done
 +if [ $n -gt 1 ] || [ $n -lt 1 ]
 +then
 +   echo "ERROR file PDF non unico"
 +   exit
 +fi
 +IFS=$SAVEIFS
 +
 +cp $filepdf "/srv/temp/pdf.pdf"
 +filepdf="/srv/temp/pdf.pdf"
 +
 +#check DC and index
 +
 +n=0
 +SAVEIFS=$IFS
 +IFS=$(echo -en "\n\b")
 +for nfile in $(find "$imagedir"*.txt -type f);
 +do
 +   let "n += 1"
 +   filedcindice="$nfile"
 +done
 +if [ $n -gt 1 ] || [ $n -lt 1 ]
 +then
 +   echo "ERROR file DC e Indice non unico"
 +   exit
 +fi
 +IFS=$SAVEIFS
 +
 +#get DC and index
 +
 +cp "$filedcindice" "/srv/temp/fdci.dci"
 +chmod +w "/srv/temp/fdci.dci"
 +#from dos to unix
 +fromdos "/srv/temp/fdci.dci"
 +filedcindice="/srv/temp/fdci.dci"
 +if [ ! -e $filedcindice ] || [ ! -f $filedcindice ] || [ ! -s $filedcindice ]
 +then
 +   if [[ "$3" != test ]]
 +   then
 +      exit
 +   else
 +      echo "ERROR file $filedcindice"
 +   fi
 +fi
 +
 +#check utf-8 or us-ascii
 +filetype=$(file -bi $filedcindice)
 +if [[ "$filetype" != *utf-8 ]] && [[ "$filetype" != *us-ascii ]]
 +then
 +   if [[ "$3" != test ]]
 +   then
 +      exit
 +   else
 +      echo "ERROR file $filedcindice $filetype"
 +   fi
 +fi
 +
 +#cut 3 special chars from begin
 +awk '{if(NR==1)sub(/^\xef\xbb\xbf/,"");print}' "/srv/temp/fdci.dci" > "/srv/temp/fdci.idx"
 +rm "/srv/temp/fdci.dci"
 +filedcindice="/srv/temp/fdci.idx"
 +
 +DCN=( [DCTITLE] [DCCREATOR] [DCSUBJECT] [DCDESCRIPTION] [DCPUBLISHER] [DCDATE] [DCTYPE] [DCFORMAT] [DCLANGUAGE] )
 +DCV[0]=""
 +linind=""
 +ndc=0
 +nind=0
 +while read line; do
 +   lline=${#line}
 +   if [ $lline -gt 0 ]
 +   then
 +      dcvar="${line%%:*}"
 +      dcval="${line#*:}"
 +      pc=${line:0:1}
 +      if [[ "$pc" < "0" ]] || [[ "$pc" > "9" ]]
 +      then
 +         let "ndc += 1"
 +         case $dcvar in
 +            TITLE)
 +               DCV[0]="$dcval"
 +            ;;
 +            CREATOR)
 +               DCV[1]="$dcval"
 +            ;;
 +            SUBJECT)
 +               DCV[2]="$dcval"
 +            ;;
 +            DESCRIPTION)
 +               DCV[3]="$dcval"
 +            ;;
 +            PUBLISHER)
 +               DCV[4]="$dcval"
 +            ;;
 +            DATE)
 +               case $dcval in
 +                  [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9])
 +                     DCV[5]="$dcval"
 +                  ;;
 +                  [0-9][0-9][0-9][0-9]-[0-9][0-9])
 +                     DCV[5]="$dcval"
 +                  ;;
 +                  [0-9][0-9][0-9][0-9])
 +                     DCV[5]="$dcval"
 +                  ;;
 +                  *)
 +                     echo "ERRORE Formato data: $dcval"
 +                     exit
 +                  ;;
 +               esac
 +            ;;
 +            TYPE)
 +               DCV[6]="$dcval"
 +            ;;
 +            FORMAT)
 +               DCV[7]="$dcval"
 +            ;;
 +            LANGUAGE)
 +               DCV[8]="$dcval"
 +            ;;
 +            *)
 +               echo "ERRORE metadati DC: $dcvar val: $dcval"
 +               exit
 +            ;;
 +         esac
 +      fi
 +   fi
 +done < "$filedcindice"
 +
 +#check DC TITLE
 +
 +ldctitle=${#DCV[0]}
 +if [ $ldctitle -gt 0 ]
 +then
 +   linkind="$bookPID|${DCV[0]}"
 +else
 +   echo "ERRORE Manca DC TITLE"
 +   exit
 +fi
 +
 +#display DC
 +
 +for i in {0..8}
 +do
 +   echo "${DCN[$i]} = ${DCV[$i]}"
 +done
 +
 +#check index
 +
 +while read line; do
 +   lline=${#line}
 +   if [ $lline -gt 0 ]
 +   then
 +      dcvar="${line%%:*}"
 +      dcval="${line#*:}"
 +      pc=${line:0:1}
 +      if !([[ "$pc" < "0" ]] || [[ "$pc" > "9" ]])
 +      then
 +         case $dcvar in
 +            [0-9][0-9][0-9][0-9])
 +               echo "$dcval"" pag.""$dcvar"
 +               let "nind += 1"
 +               linkind="$linkind""||$bookPID-$dcvar|$dcval"
 +            ;;
 +            *)
 +               echo "ERRORE indice: $dcval  pag. $dcvar"
 +               exit
 +            ;;
 +         esac
 +      fi
 +   fi
 +done < "$filedcindice"
 +rm "$filedcindice"
 +
 +#### book ingesting ####
 +########################
 +
 +#create datastream INDEX file
 +
 +fileindex="/srv/temp/idx.idx"
 +if [ -e $fileindex ]; then
 +   rm $fileindex
 +fi
 +echo "$linkind" > $fileindex
 +
 +#create book thumbnail image
 +
 +IFS=$(echo -en "\n\b")
 +filetiff=$(find "$imagedir"*0001.tif -type f)
 +IFS=$SAVEIFS
 +if [ ! -e $filetiff ];
 +then
 +   echo "ERRORE file TIFF 0001 per TN $filetiff"
 +   exit
 +fi
 +
 +#uncompress
 +tiffcp -c none "$filetiff" "/srv/temp/tiff.tmp"
 +
 +#NOT gray scale
 +cp "/srv/temp/tiff.tmp" "/srv/temp/tiff.tif"
 +rm "/srv/temp/tiff.tmp"
 +
 +#create thumbnail
 +filetn="/srv/temp/tnbook.jpg"
 +risconvert=$(convert "/srv/temp/tiff.tif"[0] -thumbnail 200x225 -fuzz 1% +repage -gravity center -format jpg -quality 100  "$filetn")
 +if [ ! -e $filetn ] || [ ! -f $filetn ] || [ ! -s $filetn ]
 +then
 +   echo "ERROR file $filetn : $risconvert"
 +   exit
 +fi
 +#convert to RGB (IE8 compatibility)
 +risconvert=$(convert "$filetn" -colorspace RGB "$filetn")
 +if [ ! -e $filetn ] || [ ! -f $filetn ] || [ ! -s $filetn ]
 +then
 +   echo "ERROR file $filetn : $risconvert"
 +   exit
 +fi
 +
 +#prepare template
 +
 +cp /home/giancarlo/clineFC/import/templateBookMaster.xml /home/giancarlo/clineFC/import/templateBook.xml
 +pidlabel=${DCV[0]}
 +pidlabel=${DCV[0]//"/"/"\/"}
 +pidlabel=${pidlabel//"&"/"\&amp;"}
 +sed -i "s/\[PID\]/$bookPID/g;s/\[PIDLABEL\]/$pidlabel/g;s/\[COLLEZIONE\]/$COLLEZIONE/g" /home/giancarlo/clineFC/import/templateBook.xml
 +
 +for i in {0..8}
 +do
 +   var=${DCN[$i]//"["/"\["}
 +   var=${var//"]"/"\]"}
 +   valo=${DCV[$i]//"/"/"\/"}
 +   valo=${valo//"&"/"\&amp;"}
 +   sed -i "s/$var/$valo/g" /home/giancarlo/clineFC/import/templateBook.xml
 +done
 +ftn=${filetn//"/"/"\/"}
 +findex=${fileindex//"/"/"\/"}
 +fpdf=${filepdf//"/"/"\/"}
 +sed -i "s/\[FILETN\]/$ftn/g;s/\[FILEINDEX\]/$findex/g;s/\[FILEPDF\]/$fpdf/g" /home/giancarlo/clineFC/import/templateBook.xml
 +
 +if [[ "$3" != test ]]
 +then
 +
 +   #ingest page
 +   risultato=$(/usr/local/fedora/client/bin/fedora-ingest.sh f /home/giancarlo/clineFC/import/templateBook.xml info:fedora/fedora-system:FOXML-1.1 fc1.to.cnr.it:8080 fedoraAdmin $3 http)
 +   if [[ "$risultato" == Ingested* ]]
 +   then
 +      echo "$risultato"
 +   else
 +      echo "ERROR INGESTING $risultato"
 +      exit
 +   fi
 +else
 +
 +   #or test
 +   echo "-> INGESTED(test)"
 +fi
 +
 +if [[ "$3" != test ]]
 +then
 +   sleep 120
 +else
 +   echo "NO wait"
 +fi
 +
 +#### pages ingesting ####
 +#########################
 +
 +if [ $4 -gt 0 ]
 +then
 +   finoa=$4
 +else
 +   finoa=ntiff
 +fi
 +
 +for (( npage=1; npage<=finoa; npage++ ))
 +do
 +
 +   #for every page
 +   
 +   snpage=$(printf "%04d" $npage)
 +   pagePID="$bookPID""-""$snpage"
 +   valo=${DCV[0]//"/"/"\/"}
 +   valo=${valo//"&"/"\&amp;"}
 +   pagePIDlabel="$valo"" - page ""$snpage"
 +   IFS=$(echo -en "\n\b")
 +   filetiff=$(find "$imagedir"*$snpage.tif -type f)
 +   IFS=$SAVEIFS
 +   if [ ! -e $filetiff ] || [ ! -f $filetiff ] || [ ! -s $filetiff ]
 +   then
 +      if [[ "$3" != test ]]
 +      then
 +         exit
 +      else
 +         echo "$snpage -> ERROR file $filetiff"
 +      fi
 +   fi
 +   
 +   #uncompress
 +   tiffcp -c none "$filetiff" "/srv/temp/tiff.tmp"
 +   #to gray scale
 +   convert "/srv/temp/tiff.tmp" -colorspace Gray "/srv/temp/tiff.tif"
 +   rm "/srv/temp/tiff.tmp"
 +   filetiff="/srv/temp/tiff.tif"
 +   filetn="/srv/temp/page-tn.jpg"
 +   #page thumbnail
 +   risconvert=$(convert "$filetiff"[0] -thumbnail 200x225 -fuzz 1% +repage -gravity center -format jpg -quality 100  "$filetn")
 +   if [ ! -e $filetn ] || [ ! -f $filetn ] || [ ! -s $filetn ]
 +   then
 +      if [[ "$3" != test ]]
 +      then
 +         exit
 +      else
 +         echo "$snpage -> ERROR file $filetn : $risconvert"
 +      fi
 +   fi
 +   
 +   #page jpeg2k
 +   filejp2="/srv/temp/jp2.jp2"
 +   riscompres=$(kdu_compress -i "$filetiff" -o "$filejp2" -rate 0.5 Clayers=1 Clevels=7 "Cprecincts={256,256},{256,256},{256,256},{128,128},{128,128},{64,64},{64,64},{32,32},{16,16}" "Corder=RPCL" "ORGgen_plt=yes" "ORGtparts=R" "Cblk={32,32}" Cuse_sop=yes)
 +   if [ ! -e $filejp2 ] || [ ! -f $filejp2 ] || [ ! -s $filejp2 ]
 +   then
 +      if [[ "$3" != test ]]
 +      then
 +         exit
 +      else
 +         echo "$snpage -> ERROR file $filejp2 : $riscompres"
 +      fi
 +   fi
 +
 +   #OCR file
 +   IFS=$(echo -en "\n\b")
 +   fileocr=$(find "$ocrdir"/*$snpage.txt -type f)
 +   IFS=$SAVEIFS
 +   #cut FF &#12 oct 014 char
 +   tr -d '\014' < "$fileocr" > "/srv/temp/ocr.tmp"
 +   #from dos to unix
 +   fromdos "/srv/temp/ocr.tmp"
 +   fileocrtmp="/srv/temp/ocr.tmp"
 +   if [ ! -e $fileocrtmp ] || [ ! -f $fileocrtmp ]
 +   then
 +      if [[ "$3" != test ]]
 +      then
 +         exit
 +      else
 +         echo "$snpage -> ERROR file $fileocrtmp"
 +      fi
 +   fi
 +   filel=$(stat -c %s $fileocrtmp)
 +   filenull=4
 +   #check zero lenght
 +   if [ ! -s $fileocrtmp ] ||  [ "$filel" -le "$filenull" ]
 +   then
 +      cp blank.txt $fileocrtmp
 +      echo "blank file OCR"
 +   fi
 +   #check utf-8 or us-ascii
 +   filetype=$(file -bi $fileocrtmp)
 +   if [[ "$filetype" != *utf-8 ]] && [[ "$filetype" != *us-ascii ]]
 +   then
 +      if [[ "$3" != test ]]
 +      then
 +         exit
 +      else
 +         echo "$snpage -> ERROR file $fileocrtmp"
 +      fi
 +   fi
 +   awk '{if(NR==1)sub(/^\xef\xbb\xbf/,"");print}' "/srv/temp/ocr.tmp" > "/srv/temp/ocr.ocr"
 +   rm "/srv/temp/ocr.tmp"
 +   fileocrtmp="/srv/temp/ocr.ocr"
 +   
 +   #prepare template
 +   cp /home/giancarlo/clineFC/import/templatePageMaster.xml /home/giancarlo/clineFC/import/templatePage.xml
 +   sed -i "s/\[PID\]/$pagePID/g;s/\[PIDLABEL\]/$pagePIDlabel/g" /home/giancarlo/clineFC/import/templatePage.xml
 +   sed -i "s/\[DCTITLE\]/$pagePIDlabel/g;s/\[PIDBOOK\]/$bookPID/g" /home/giancarlo/clineFC/import/templatePage.xml
 +   
 +   #without DCTITLE
 +   for i in {1..8}
 +   do
 +      var=${DCN[$i]//"["/"\["}
 +      var=${var//"]"/"\]"}
 +      valo=${DCV[$i]//"/"/"\/"}
 +      valo=${valo//"&"/"\&amp;"}
 +      sed -i "s/$var/$valo/g" /home/giancarlo/clineFC/import/templatePage.xml
 +   done
 +   
 +   ftn=${filetn//"/"/"\/"}
 +   fjp2=${filejp2//"/"/"\/"}
 +   focr=${fileocrtmp//"/"/"\/"}
 +   ftiff=${filetiff//"/"/"\/"}
 +   sed -i "s/\[FILETN\]/$ftn/g;s/\[FILEJP2\]/$fjp2/g;s/\[FILEOCR\]/$focr/g" /home/giancarlo/clineFC/import/templatePage.xml
 +   sed -i "s/\[FILETIFF\]/$ftiff/g" /home/giancarlo/clineFC/import/templatePage.xml
 +   
 +   if [[ "$3" != test ]]
 +   then
 +
 +      #ingest page
 +      risultato=$(/usr/local/fedora/client/bin/fedora-ingest.sh f /home/giancarlo/clineFC/import/templatePage.xml info:fedora/fedora-system:FOXML-1.1 fc1.to.cnr.it:8080 fedoraAdmin $3 http)
 +      if [[ "$risultato" == Ingested* ]]
 +      then
 +         echo "$snpage -> $risultato"
 +      else
 +         echo "$snpage -> ERROR INGESTING $risultato"
 +         exit
 +      fi
 +   else
 +
 +      #or test
 +      echo "$snpage -> INGESTED(test)"
 +   fi
 +done
 </code> </code>
 </WRAP> </WRAP>
 
 
ingesting/ingbscript.txt · Last modified: 2012/11/27 11:02 by giancarlo

Developers: CNR IRCrES IT Office and Library
Giancarlo Birello (giancarlo.birello _@_ ircres.cnr.it) and Anna Perin (anna.perin _@_ ircres.cnr.it)
DigiBess is licensed under: Creative Commons License
Recent changes RSS feed Creative Commons License Valid XHTML 1.0 Valid CSS Driven by DokuWiki
Drupal Garland Theme for Dokuwiki