Both sides previous revision
Previous revision
Next revision
|
Previous revision
|
ingesting:ingbscript [2012/11/27 10:17] giancarlo |
ingesting:ingbscript [2012/11/27 11:02] (current) giancarlo |
| |
| |
* Multiple book ingesting script | * Multiple book ingesting script (same collection) |
| |
<WRAP prewrap center> | <WRAP prewrap center> |
echo "=====> $nomedir" | echo "=====> $nomedir" |
./ingbookepages.sh "$ndir" $2 $3 $4 | ./ingbookepages.sh "$ndir" $2 $3 $4 |
| done |
| </code> |
| </WRAP> |
| |
| * Single book ingesting script |
| |
| <WRAP prewrap center> |
| <code bash ingbookepages.sh> |
#!/bin/bash
#
# Single book ingesting script.
#
# Parameters:
#   $1  book directory
#       e.g. /srv/storage/scansioni/archivio/fgramsci_TO024-00001
#   $2  collection
#       e.g. openbess:cognetti-C001
#   $3  fedoraAdmin password, or the literal word "test"
#   $4  number of pages to ingest, or 0 for all

FEDORA_HOME="/usr/local/fedora"
export FEDORA_HOME

PATH=$PATH:$FEDORA_HOME/server/bin:$FEDORA_HOME/client/bin:$JAVA_HOME/bin:/bin:/usr/bin:/sbin:/usr/sbin
export PATH

# strip_trailing_slashes PATH
# Print PATH without any trailing "/" characters, so that the last path
# component (from which the book PID is derived) is never empty.
strip_trailing_slashes() {
  local path=$1
  while [[ "$path" == */ ]]; do
    path=${path%/}
  done
  printf '%s\n' "$path"
}

#1 book directory
IMPORT_BASE_DIR=$(strip_trailing_slashes "$1")

#2 collection
COLLEZIONE=$2

# last component of the book directory, e.g. fgramsci_TO024-00001
IMPORT_VOL_DIR=${IMPORT_BASE_DIR##*/}

#### pre-ingesting check ####
#############################

# the book PID is the directory name with its first "_" turned into ":"
bookPID=${IMPORT_VOL_DIR/_/:}
# prefix used to build glob patterns for the files of this book
imagedir="$IMPORT_BASE_DIR/"
echo "============================================================================="
echo "Book PID: $bookPID"
echo "dir: $imagedir"
echo "-----------------------------------------------------------------------------"
| |
#check files TIFF

# SAVEIFS is still recorded here because other sections of this script
# restore IFS from it; this section no longer changes IFS itself.
SAVEIFS=$IFS

# check_tiff_sequence PREFIX
# Walk the regular files matching PREFIX*.tif in glob (sorted) order and
# verify that the n-th one ends in the 4-digit, zero-padded n followed by
# ".tif". Leaves the number of files seen in the global ntiff.
# Returns 1 (after printing the error) on the first file out of sequence.
check_tiff_sequence() {
  local prefix=$1 nfile sn
  ntiff=0
  for nfile in "$prefix"*.tif; do
    # skip an unmatched pattern, directories and symlinks
    [[ -f "$nfile" && ! -L "$nfile" ]] || continue
    ntiff=$((ntiff + 1))
    printf -v sn '%04d' "$ntiff"
    if [[ "$nfile" != *"$sn".tif ]]; then
      echo "ERROR file $nfile non corrisponde a $sn"
      return 1
    fi
  done
  return 0
}

# exit with a failure status so that a caller can detect the error
check_tiff_sequence "$imagedir" || exit 1
| |
#check only one OCR directory

# Collect every directory found under the book directory (find recurses),
# NUL-delimited so that names containing whitespace count as one entry.
ocr_dirs=()
while IFS= read -r -d '' ndir; do
  ocr_dirs+=("$ndir")
done < <(find "$imagedir"* -type d -print0)

n=${#ocr_dirs[@]}
if (( n != 1 )); then
  echo "ERROR dir TXT non unica"
  # non-zero status so that a caller can detect the failure
  exit 1
fi

# the single directory found holds the OCR text files
ocrdir=${ocr_dirs[0]}
| |
#check files TXT

# SAVEIFS is still recorded here because other sections of this script
# restore IFS from it; this section no longer changes IFS itself.
SAVEIFS=$IFS

# check_txt_sequence DIR
# Walk the regular files matching DIR/*.txt in glob (sorted) order and
# verify that the n-th one ends in the 4-digit, zero-padded n followed by
# ".txt". Leaves the number of files seen in the global ntxt.
# Returns 1 (after printing the error) on the first file out of sequence.
check_txt_sequence() {
  local dir=$1 nfile st
  ntxt=0
  for nfile in "$dir"/*.txt; do
    # skip an unmatched pattern, directories and symlinks
    [[ -f "$nfile" && ! -L "$nfile" ]] || continue
    ntxt=$((ntxt + 1))
    printf -v st '%04d' "$ntxt"
    if [[ "$nfile" != *"$st".txt ]]; then
      echo "ERROR file $nfile non corrisponde a $st"
      return 1
    fi
  done
  return 0
}

check_txt_sequence "$ocrdir" || exit 1

#check same number of files TIFF and TXT

if (( ntiff != ntxt )); then
  echo "ERROR files $ntiff TIFF non corrisponde a $ntxt TXT"
  exit 1
fi
| |
#check PDF

# SAVEIFS is still recorded here because other sections of this script
# restore IFS from it; this section no longer changes IFS itself.
SAVEIFS=$IFS

# count the regular *.pdf files that sit directly in the book directory
n=0
filepdf=""
for nfile in "$imagedir"*.pdf; do
  # skip an unmatched pattern, directories and symlinks
  [[ -f "$nfile" && ! -L "$nfile" ]] || continue
  n=$((n + 1))
  filepdf=$nfile
done

# exactly one PDF is required
if (( n != 1 )); then
  echo "ERROR file PDF non unico"
  exit 1
fi

# stage the PDF under the fixed name used later in the book template;
# the source path is quoted so that names with spaces are copied correctly
if ! cp -- "$filepdf" "/srv/temp/pdf.pdf"; then
  echo "ERROR file $filepdf"
  exit 1
fi
filepdf="/srv/temp/pdf.pdf"
| |
#check DC and index

# SAVEIFS is still recorded here because other sections of this script
# restore IFS from it; this section no longer changes IFS itself.
SAVEIFS=$IFS

# count the regular *.txt files that sit directly in the book directory
n=0
filedcindice=""
for nfile in "$imagedir"*.txt; do
  # skip an unmatched pattern, directories and symlinks
  [[ -f "$nfile" && ! -L "$nfile" ]] || continue
  n=$((n + 1))
  filedcindice=$nfile
done

# exactly one file holding both DC metadata and index is required
if (( n != 1 )); then
  echo "ERROR file DC e Indice non unico"
  exit 1
fi

#get DC and index

# remove a copy left behind by an earlier run so it cannot be mistaken
# for this book's file if the copy below fails
rm -f -- "/srv/temp/fdci.dci"
cp -- "$filedcindice" "/srv/temp/fdci.dci"
chmod +w "/srv/temp/fdci.dci"
#from dos to unix
fromdos "/srv/temp/fdci.dci"
filedcindice="/srv/temp/fdci.dci"

# the staged copy must be a non-empty regular file; the error is always
# reported, and only test mode ($3 = test) carries on after it
if [[ ! -f "$filedcindice" || ! -s "$filedcindice" ]]; then
  echo "ERROR file $filedcindice"
  [[ "$3" == test ]] || exit 1
fi

#check utf-8 or us-ascii
filetype=$(file -bi "$filedcindice")
if [[ "$filetype" != *utf-8 && "$filetype" != *us-ascii ]]; then
  echo "ERROR file $filedcindice $filetype"
  [[ "$3" == test ]] || exit 1
fi

#cut 3 special chars (UTF-8 byte order mark) from begin
awk '{if(NR==1)sub(/^\xef\xbb\xbf/,"");print}' "/srv/temp/fdci.dci" > "/srv/temp/fdci.idx"
rm -- "/srv/temp/fdci.dci"
filedcindice="/srv/temp/fdci.idx"
| |
# Template placeholders, in the same order as the values stored in DCV.
# They are quoted: unquoted, each "[...]" is a glob bracket expression and
# would expand to any matching one-character file name in the current dir.
DCN=(
  "[DCTITLE]" "[DCCREATOR]" "[DCSUBJECT]" "[DCDESCRIPTION]" "[DCPUBLISHER]"
  "[DCDATE]" "[DCTYPE]" "[DCFORMAT]" "[DCLANGUAGE]"
)
DCV=()
linkind=""
ndc=0
nind=0

# parse_dc_metadata FILE
# Read the "NAME:value" lines of FILE (lines that do not start with a digit;
# lines starting with a digit are index entries and are skipped here) and
# store each value in the DCV slot that matches DCN. Increments the global
# ndc once per metadata line.
# Returns 1, after printing the error, on an unknown NAME or a DATE that is
# not YYYY, YYYY-MM or YYYY-MM-DD.
parse_dc_metadata() {
  local file=$1 line dcvar dcval
  # -r keeps backslashes in the values intact; the default IFS still trims
  # leading/trailing blanks from each line
  while read -r line; do
    [[ -n "$line" ]] || continue
    [[ "${line:0:1}" != [0-9] ]] || continue
    dcvar=${line%%:*}
    dcval=${line#*:}
    ndc=$((ndc + 1))
    case "$dcvar" in
      TITLE) DCV[0]=$dcval ;;
      CREATOR) DCV[1]=$dcval ;;
      SUBJECT) DCV[2]=$dcval ;;
      DESCRIPTION) DCV[3]=$dcval ;;
      PUBLISHER) DCV[4]=$dcval ;;
      DATE)
        case "$dcval" in
          [0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9] | \
            [0-9][0-9][0-9][0-9]-[0-9][0-9] | \
            [0-9][0-9][0-9][0-9])
            DCV[5]=$dcval
            ;;
          *)
            echo "ERRORE Formato data: $dcval"
            return 1
            ;;
        esac
        ;;
      TYPE) DCV[6]=$dcval ;;
      FORMAT) DCV[7]=$dcval ;;
      LANGUAGE) DCV[8]=$dcval ;;
      *)
        echo "ERRORE metadati DC: $dcvar val: $dcval"
        return 1
        ;;
    esac
  done < "$file"
  return 0
}

# exit with a failure status so that a caller can detect the error
parse_dc_metadata "$filedcindice" || exit 1
| |
#check DC TITLE

# The title is mandatory: it opens the index string, which starts with
# "<book PID>|<title>".
if [[ -n "${DCV[0]}" ]]; then
  linkind="$bookPID|${DCV[0]}"
else
  echo "ERRORE Manca DC TITLE"
  # non-zero status so that a caller can detect the failure
  exit 1
fi

#display DC

# one "placeholder = value" line for each of the nine DC elements
for (( i = 0; i <= 8; i++ )); do
  echo "${DCN[$i]} = ${DCV[$i]}"
done
| |
#check index

# parse_index_entries FILE
# Read the "NNNN:title" lines of FILE (lines starting with a digit; all
# other lines are DC metadata and are skipped here). For each entry print
# "<title> pag.<NNNN>", increment the global nind and append
# "||<bookPID>-<NNNN>|<title>" to the global linkind.
# Returns 1, after printing the error, when the page part of an entry is
# not exactly four digits.
parse_index_entries() {
  local file=$1 line page title
  # -r keeps backslashes in the titles intact
  while read -r line; do
    [[ -n "$line" ]] || continue
    [[ "${line:0:1}" == [0-9] ]] || continue
    page=${line%%:*}
    title=${line#*:}
    case "$page" in
      [0-9][0-9][0-9][0-9])
        echo "$title pag.$page"
        nind=$((nind + 1))
        linkind="$linkind||$bookPID-$page|$title"
        ;;
      *)
        echo "ERRORE indice: $title pag. $page"
        return 1
        ;;
    esac
  done < "$file"
  return 0
}

# exit with a failure status so that a caller can detect the error
parse_index_entries "$filedcindice" || exit 1
rm -- "$filedcindice"
| |
#### book ingesting ####
########################

#create datastream INDEX file

# ">" truncates an existing file, so no separate removal is needed
fileindex="/srv/temp/idx.idx"
printf '%s\n' "$linkind" > "$fileindex"

#create book thumbnail image

# first regular file whose name ends in 0001.tif (the first page)
filetiff=""
for candidate in "$imagedir"*0001.tif; do
  if [[ -f "$candidate" && ! -L "$candidate" ]]; then
    filetiff=$candidate
    break
  fi
done
# An explicit emptiness test: "[ ! -e $filetiff ]" with an empty, unquoted
# variable collapses to "[ ! -e ]", which is false, so a missing first page
# used to go unnoticed.
if [[ -z "$filetiff" ]]; then
  echo "ERRORE file TIFF 0001 per TN $filetiff"
  exit 1
fi

filetn="/srv/temp/tnbook.jpg"
# remove outputs of an earlier run so that a failed conversion cannot be
# masked by a stale file passing the checks below
rm -f -- "/srv/temp/tiff.tmp" "/srv/temp/tiff.tif" "$filetn"

#uncompress
tiffcp -c none "$filetiff" "/srv/temp/tiff.tmp"

#NOT gray scale
mv -f -- "/srv/temp/tiff.tmp" "/srv/temp/tiff.tif"

#create thumbnail
# "[0]" selects the first image of the TIFF; it is quoted so the shell does
# not treat it as a glob. stderr is captured so the message shows the cause.
risconvert=$(convert "/srv/temp/tiff.tif[0]" -thumbnail 200x225 -fuzz 1% +repage -gravity center -format jpg -quality 100 "$filetn" 2>&1)
if [[ ! -f "$filetn" || ! -s "$filetn" ]]; then
  echo "ERROR file $filetn : $risconvert"
  exit 1
fi
#convert to RGB (IE8 compatibility)
risconvert=$(convert "$filetn" -colorspace RGB "$filetn" 2>&1)
if [[ ! -f "$filetn" || ! -s "$filetn" ]]; then
  echo "ERROR file $filetn : $risconvert"
  exit 1
fi
| |
#prepare template

# sed_escape_replacement TEXT
# Print TEXT escaped for use as the replacement part of a sed "s/…/…/"
# command: backslash first (so the escapes added afterwards are not
# doubled), then the "/" delimiter and "&".
sed_escape_replacement() {
  local text=$1
  text=${text//"\\"/"\\\\"}
  text=${text//"/"/"\/"}
  text=${text//"&"/"\&"}
  printf '%s\n' "$text"
}

book_template_master=/home/giancarlo/clineFC/import/templateBookMaster.xml
book_template=/home/giancarlo/clineFC/import/templateBook.xml

# always start from a fresh copy of the master template
cp "$book_template_master" "$book_template"

pidlabel=$(sed_escape_replacement "${DCV[0]}")

# All substitutions are collected and applied by a single sed run; sed
# executes them in the order given, exactly as the separate runs did.
sed_args=(
  -e "s/\[PID\]/$bookPID/g"
  -e "s/\[PIDLABEL\]/$pidlabel/g"
  -e "s/\[COLLEZIONE\]/$COLLEZIONE/g"
)

# one substitution per DC element: placeholder DCN[i] -> value DCV[i]
for (( i = 0; i <= 8; i++ )); do
  # the brackets of the placeholder must be literal in the sed pattern
  var=${DCN[$i]//"["/"\["}
  var=${var//"]"/"\]"}
  valo=$(sed_escape_replacement "${DCV[$i]}")
  sed_args+=(-e "s/$var/$valo/g")
done

# paths of the staged thumbnail, index and PDF files
ftn=$(sed_escape_replacement "$filetn")
findex=$(sed_escape_replacement "$fileindex")
fpdf=$(sed_escape_replacement "$filepdf")
sed_args+=(
  -e "s/\[FILETN\]/$ftn/g"
  -e "s/\[FILEINDEX\]/$findex/g"
  -e "s/\[FILEPDF\]/$fpdf/g"
)

sed -i "${sed_args[@]}" "$book_template"
| |
if [[ "$3" != test ]]; then

  #ingest book
  # $3 (the fedoraAdmin password) is quoted so that a password containing
  # blanks or glob characters reaches the client as a single argument
  risultato=$(/usr/local/fedora/client/bin/fedora-ingest.sh f /home/giancarlo/clineFC/import/templateBook.xml info:fedora/fedora-system:FOXML-1.1 fc1.to.cnr.it:8080 fedoraAdmin "$3" http)
  if [[ "$risultato" == Ingested* ]]; then
    echo "$risultato"
  else
    echo "ERROR INGESTING $risultato"
    # non-zero status so that a caller can detect the failure
    exit 1
  fi

  # NOTE(review): presumably gives the repository time to finish handling
  # the book object before its pages are ingested -- confirm
  sleep 120
else

  #or test
  echo "-> INGESTED(test)"
  echo "NO wait"
fi
| |
#### pages ingesting ####
#########################

# sed_escape_replacement TEXT
# Print TEXT escaped for use as the replacement part of a sed "s/…/…/"
# command: backslash first (so the escapes added afterwards are not
# doubled), then the "/" delimiter and "&".
sed_escape_replacement() {
  local text=$1
  text=${text//"\\"/"\\\\"}
  text=${text//"/"/"\/"}
  text=${text//"&"/"\&"}
  printf '%s\n' "$text"
}

# resolve_page_count REQUESTED TOTAL
# Print the number of pages to ingest: REQUESTED when it is a positive
# decimal integer (leading zeros allowed), otherwise TOTAL (0 if empty).
resolve_page_count() {
  local requested=$1 total=$2
  if [[ "$requested" =~ ^[0-9]+$ ]] && (( 10#$requested > 0 )); then
    # 10# forces base 10 so that e.g. "08" is not read as invalid octal
    printf '%d\n' "$((10#$requested))"
  else
    printf '%s\n' "${total:-0}"
  fi
}

# page_error MESSAGE
# Report a per-page problem. The message is always printed; outside test
# mode the run is aborted with a failure status, in test mode it goes on.
page_error() {
  echo "$1"
  if [[ "$password_or_test" != test ]]; then
    exit 1
  fi
}

# $3 is the fedoraAdmin password or the word "test"; it is copied because
# "$3" means something else inside the helper functions
password_or_test=$3
page_template_master=/home/giancarlo/clineFC/import/templatePageMaster.xml
page_template=/home/giancarlo/clineFC/import/templatePage.xml

#4 number of pages to ingest or 0 for all
finoa=$(resolve_page_count "$4" "$ntiff")

for (( npage = 1; npage <= finoa; npage++ )); do

  #for every page

  printf -v snpage '%04d' "$npage"
  pagePID="$bookPID-$snpage"
  # already escaped for sed: it is only used as a replacement text below
  pagePIDlabel="$(sed_escape_replacement "${DCV[0]}") - page $snpage"

  # Remove the outputs of the previous page. They all live under fixed
  # names, so without this a failed conversion would leave the previous
  # page's file in place and the checks below would accept it.
  rm -f -- "/srv/temp/tiff.tmp" "/srv/temp/tiff.tif" "/srv/temp/page-tn.jpg" \
    "/srv/temp/jp2.jp2" "/srv/temp/ocr.tmp" "/srv/temp/ocr.ocr"

  # first regular file whose name ends in <page number>.tif
  filetiff=""
  for candidate in "$imagedir"*"$snpage".tif; do
    if [[ -f "$candidate" && ! -L "$candidate" ]]; then
      filetiff=$candidate
      break
    fi
  done
  if [[ -z "$filetiff" || ! -s "$filetiff" ]]; then
    page_error "$snpage -> ERROR file $filetiff"
  fi

  #uncompress
  tiffcp -c none "$filetiff" "/srv/temp/tiff.tmp"
  #to gray scale
  convert "/srv/temp/tiff.tmp" -colorspace Gray "/srv/temp/tiff.tif"
  rm -f -- "/srv/temp/tiff.tmp"
  filetiff="/srv/temp/tiff.tif"
  filetn="/srv/temp/page-tn.jpg"
  #page thumbnail
  # "[0]" selects the first image of the TIFF; quoted so it is not a glob
  risconvert=$(convert "${filetiff}[0]" -thumbnail 200x225 -fuzz 1% +repage -gravity center -format jpg -quality 100 "$filetn" 2>&1)
  if [[ ! -f "$filetn" || ! -s "$filetn" ]]; then
    page_error "$snpage -> ERROR file $filetn : $risconvert"
  fi

  #page jpeg2k
  filejp2="/srv/temp/jp2.jp2"
  riscompres=$(kdu_compress -i "$filetiff" -o "$filejp2" -rate 0.5 Clayers=1 Clevels=7 "Cprecincts={256,256},{256,256},{256,256},{128,128},{128,128},{64,64},{64,64},{32,32},{16,16}" "Corder=RPCL" "ORGgen_plt=yes" "ORGtparts=R" "Cblk={32,32}" Cuse_sop=yes 2>&1)
  if [[ ! -f "$filejp2" || ! -s "$filejp2" ]]; then
    page_error "$snpage -> ERROR file $filejp2 : $riscompres"
  fi

  #OCR file
  # first regular file in the OCR directory ending in <page number>.txt
  fileocr=""
  for candidate in "$ocrdir"/*"$snpage".txt; do
    if [[ -f "$candidate" && ! -L "$candidate" ]]; then
      fileocr=$candidate
      break
    fi
  done
  fileocrtmp="/srv/temp/ocr.tmp"
  if [[ -n "$fileocr" ]]; then
    #cut FF  oct 014 char
    tr -d '\014' < "$fileocr" > "$fileocrtmp"
    #from dos to unix
    fromdos "$fileocrtmp"
  fi
  if [[ ! -f "$fileocrtmp" ]]; then
    page_error "$snpage -> ERROR file $fileocrtmp"
  fi

  #check zero length
  # an OCR text of at most 4 bytes is treated as empty and replaced by
  # blank.txt, which is looked up in the current working directory
  filel=$(stat -c %s "$fileocrtmp")
  filenull=4
  if (( ${filel:-0} <= filenull )); then
    cp blank.txt "$fileocrtmp"
    echo "blank file OCR"
  fi

  #check utf-8 or us-ascii
  filetype=$(file -bi "$fileocrtmp")
  if [[ "$filetype" != *utf-8 && "$filetype" != *us-ascii ]]; then
    page_error "$snpage -> ERROR file $fileocrtmp"
  fi

  # strip a leading UTF-8 byte order mark
  awk '{if(NR==1)sub(/^\xef\xbb\xbf/,"");print}' "$fileocrtmp" > "/srv/temp/ocr.ocr"
  rm -f -- "$fileocrtmp"
  fileocrtmp="/srv/temp/ocr.ocr"

  #prepare template
  cp "$page_template_master" "$page_template"

  # All substitutions are collected and applied by a single sed run; sed
  # executes them in the order given, exactly as the separate runs did.
  sed_args=(
    -e "s/\[PID\]/$pagePID/g"
    -e "s/\[PIDLABEL\]/$pagePIDlabel/g"
    -e "s/\[DCTITLE\]/$pagePIDlabel/g"
    -e "s/\[PIDBOOK\]/$bookPID/g"
  )

  #without DCTITLE
  for (( i = 1; i <= 8; i++ )); do
    # the brackets of the placeholder must be literal in the sed pattern
    var=${DCN[$i]//"["/"\["}
    var=${var//"]"/"\]"}
    valo=$(sed_escape_replacement "${DCV[$i]}")
    sed_args+=(-e "s/$var/$valo/g")
  done

  ftn=$(sed_escape_replacement "$filetn")
  fjp2=$(sed_escape_replacement "$filejp2")
  focr=$(sed_escape_replacement "$fileocrtmp")
  ftiff=$(sed_escape_replacement "$filetiff")
  sed_args+=(
    -e "s/\[FILETN\]/$ftn/g"
    -e "s/\[FILEJP2\]/$fjp2/g"
    -e "s/\[FILEOCR\]/$focr/g"
    -e "s/\[FILETIFF\]/$ftiff/g"
  )
  sed -i "${sed_args[@]}" "$page_template"

  if [[ "$password_or_test" != test ]]; then

    #ingest page
    # the password is quoted so that it reaches the client as one argument
    risultato=$(/usr/local/fedora/client/bin/fedora-ingest.sh f "$page_template" info:fedora/fedora-system:FOXML-1.1 fc1.to.cnr.it:8080 fedoraAdmin "$password_or_test" http)
    if [[ "$risultato" == Ingested* ]]; then
      echo "$snpage -> $risultato"
    else
      echo "$snpage -> ERROR INGESTING $risultato"
      exit 1
    fi
  else

    #or test
    echo "$snpage -> INGESTED(test)"
  fi
done
</code> | </code> |
</WRAP> | </WRAP> |