====== Manage DublinCore by CSV, BASH and CRUD ======
\\
**CSV -> DC.xml -> update datastream**
* spreadsheet:
* first row PID, dc:title, dc:date, ...
* one object per row; multiple values for the same element go into the same cell, separated by |
* convert spreadsheet to CSV using § as field separator, no text delimiter
* run the CSV2DC.sh script to convert the CSV data into XML files ready for the CRUD datastream replacement:
#!/bin/bash
#
# CSV2DC.sh - turn a spreadsheet export into one Dublin Core XML file per row.
#
# Usage: CSV2DC.sh INPUT.csv
#
# INPUT.csv uses '§' as field separator and no text delimiter. The first row
# holds the labels (PID, dc:title, dc:date, ...); every following row describes
# one object. Several values for the same element share one cell, separated by
# '|'. For each row whose first cell (the PID) is not empty, a file named
# <PID with ':' replaced by '_'>_DC.xml is written to the current directory.
#
# Stdout: the "index/label" list of the header row, then one file name per row.
# Exit status: 0 on success, 1 on write error, 2 on wrong usage,
#              99 if INPUT.csv does not exist.

readonly FIELD_SEP='§'
readonly VALUE_SEP='|'

# Wrapper element written around the dc:* children.
# NOTE(review): this is the standard oai_dc root element; confirm it matches
# the DC datastreams of the target repository.
readonly DC_ROOT_OPEN='<oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">'
readonly DC_ROOT_CLOSE='</oai_dc:dc>'

# Print a short usage message on stderr.
usage() {
  printf 'Usage: %s INPUT.csv\n' "${0##*/}" >&2
}

#######################################
# Split a string on a (possibly multi-byte) separator.
# Done with parameter expansion instead of IFS so that '§' is handled the
# same way in UTF-8 and in C/POSIX locales. A single trailing empty field is
# dropped, like 'read -a' does.
# Arguments: $1 - string to split, $2 - separator
# Globals:   SPLIT_RESULT (written) - array holding the fields
#######################################
split_fields() {
  local rest=$1 sep=$2
  SPLIT_RESULT=()
  while [[ "$rest" == *"$sep"* ]]; do
    SPLIT_RESULT+=("${rest%%"$sep"*}")
    rest=${rest#*"$sep"}
  done
  if [[ -n "$rest" ]]; then
    SPLIT_RESULT+=("$rest")
  fi
}

# Print $1 with leading and trailing whitespace removed.
trim() {
  local s=$1
  s=${s#"${s%%[![:space:]]*}"}
  s=${s%"${s##*[![:space:]]}"}
  printf '%s' "$s"
}

# Print $1 with the XML special characters &, < and > escaped.
xml_escape() {
  local s=$1
  local amp='&amp;' lt='&lt;' gt='&gt;'
  # The replacements are quoted so that bash >= 5.2 does not treat '&' as
  # "the matched text". '&' must be escaped first.
  s=${s//&/"$amp"}
  s=${s//</"$lt"}
  s=${s//>/"$gt"}
  printf '%s' "$s"
}

#######################################
# Convert every data row of the CSV into a <PID>_DC.xml file.
# Arguments: $1 - path of the CSV file
# Outputs:   label list and generated file names on stdout
# Returns:   0 on success, 1 on write error, 2 on usage error,
#            99 if the input is missing
#######################################
main() {
  if (( $# != 1 )); then
    usage
    return 2
  fi

  local input=$1
  if [[ ! -f "$input" ]]; then
    printf '%s file not found\n' "$input" >&2
    return 99
  fi

  local -a labels=() values=()
  local line index pid fname label part
  local line_no=0

  # '|| [[ -n $line ]]' keeps a last row that has no trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    line=${line%$'\r'}    # tolerate CRLF exports
    line_no=$(( line_no + 1 ))
    split_fields "$line" "$FIELD_SEP"
    values=("${SPLIT_RESULT[@]}")

    # Header row: remember the element names and list them.
    if (( line_no == 1 )); then
      labels=()
      for index in "${!values[@]}"; do
        labels[index]=$(trim "${values[index]}")
        printf '%s/%s\n' "$index" "${labels[index]}"
      done
      continue
    fi

    # Rows without a PID are skipped.
    pid=$(trim "${values[0]-}")
    [[ -n "$pid" ]] || continue

    fname="${pid//:/_}_DC.xml"
    printf '%s\n' "$fname"

    {
      printf '%s\n' "$DC_ROOT_OPEN"
      for index in "${!values[@]}"; do
        (( index == 0 )) && continue
        label=${labels[index]-}
        # A cell beyond the last header column has no element name.
        [[ -n "$label" ]] || continue
        split_fields "${values[index]}" "$VALUE_SEP"
        for part in "${SPLIT_RESULT[@]}"; do
          part=$(trim "$part")
          [[ -n "$part" ]] || continue
          printf '  <%s>%s</%s>\n' "$label" "$(xml_escape "$part")" "$label"
        done
      done
      printf '%s\n' "$DC_ROOT_CLOSE"
    } > "$fname" || {
      printf 'cannot write %s\n' "$fname" >&2
      return 1
    }
  done < "$input"

  return 0
}

# The exit status of the script is the return status of main.
main "$@"
This creates as many files as there are rows, minus the header row, e.g. smtextlib_17052_DC.xml, smtextlib_17053_DC.xml, ...
* Move the XML files to an empty directory such as ~/book/DC, then run CRUD to replace the DC datastreams:
drush -u 1 -v islandora_datastream_crud_push_datastreams --datastreams_source_directory=/home/giancarlo/book/DC --datastreams_crud_log=/home/giancarlo/crud.log --update_object_label
\\
**Retrieve multiple DC.xml -> CSV**
* Run CRUD to list PIDs of specific collection:
drush -u 1 -v islandora_datastream_crud_fetch_pids --collection=smarch:lds2002 --pid_file=/home/giancarlo/book/PID/pids.txt
* Run CRUD to extract DC of listed PIDs:
drush -u 1 -v islandora_datastream_crud_fetch_datastreams --pid_file=/home/giancarlo/book/PID/pids.txt --dsid=DC --datastreams_directory=/home/giancarlo/book/DC
* Create DCElements.txt:
dc:title§dc:date§dc:coverage§dc:description§dc:source§dc:subject§dc:contributor§dc:identifier§dc:creator§dc:publisher§dc:type§dc:language§dc:format§dc:relation§dc:rights
* Single object DC bash converter, DCXML2CSV.sh:
#!/bin/bash
#
# DCXML2CSV.sh - append one CSV row describing a single DC.xml file.
#
# Usage: DCXML2CSV.sh ELEMENTS_FILE DC_XML_FILE OUTPUT_CSV
#
#   ELEMENTS_FILE  first line lists the elements to export, separated by '§'
#                  (e.g. dc:title§dc:date§...)
#   DC_XML_FILE    Dublin Core XML file to read
#   OUTPUT_CSV     file the row is appended to
#
# The row is: first dc:identifier, then one cell per element, cells separated
# by '§'; multiple values of one element are joined with '|'.
#
# Stdout: the "index/label" list of ELEMENTS_FILE.
# Exit status: 0 on success, 1 on empty element list or write error,
#              2 on wrong usage, 99 if an input file is missing,
#              127 if xmlstarlet is not installed.

readonly FIELD_SEP='§'
readonly VALUE_SEP='|'

# Print a short usage message on stderr.
usage() {
  printf 'Usage: %s ELEMENTS_FILE DC_XML_FILE OUTPUT_CSV\n' "${0##*/}" >&2
}

#######################################
# Split a string on a (possibly multi-byte) separator.
# Done with parameter expansion instead of IFS so that '§' is handled the
# same way in UTF-8 and in C/POSIX locales. A single trailing empty field is
# dropped, like 'read -a' does.
# Arguments: $1 - string to split, $2 - separator
# Globals:   SPLIT_RESULT (written) - array holding the fields
#######################################
split_fields() {
  local rest=$1 sep=$2
  SPLIT_RESULT=()
  while [[ "$rest" == *"$sep"* ]]; do
    SPLIT_RESULT+=("${rest%%"$sep"*}")
    rest=${rest#*"$sep"}
  done
  if [[ -n "$rest" ]]; then
    SPLIT_RESULT+=("$rest")
  fi
}

# Print $1 with every run of blanks/tabs/newlines collapsed to one space and
# the ends trimmed, so a value can never break the one-row-per-object layout.
# Unlike an unquoted 'echo $value', no pathname expansion takes place.
squeeze_ws() {
  local IFS=$' \t\n'
  local -a words=()
  # read returns non-zero at end of input but still fills the array.
  read -r -d '' -a words <<< "$1"
  printf '%s' "${words[*]}"
}

# Print the string value of XPath expression $1 evaluated on file $2.
xpath_value() {
  xmlstarlet sel -t -v "$1" "$2"
}

#######################################
# Build the CSV row for one DC.xml and append it to the output file.
# Arguments: $1 - elements file, $2 - DC.xml file, $3 - output CSV
# Outputs:   label list on stdout, one row appended to $3
# Returns:   0 on success, non-zero on error (see file header)
#######################################
main() {
  if (( $# != 3 )); then
    usage
    return 2
  fi

  local elements=$1 xmlfile=$2 outfile=$3
  local path
  for path in "$elements" "$xmlfile"; do
    if [[ ! -f "$path" ]]; then
      printf '%s file not found\n' "$path" >&2
      return 99
    fi
  done
  if ! command -v xmlstarlet > /dev/null 2>&1; then
    printf 'xmlstarlet is required but was not found\n' >&2
    return 127
  fi

  # Only the first line of the elements file is used.
  local header=''
  IFS= read -r header < "$elements"
  header=${header%$'\r'}
  split_fields "$header" "$FIELD_SEP"
  local -a labels=("${SPLIT_RESULT[@]}")
  if (( ${#labels[@]} == 0 )); then
    printf '%s contains no element names\n' "$elements" >&2
    return 1
  fi

  local index
  for index in "${!labels[@]}"; do
    printf '%s/%s\n' "$index" "${labels[index]}"
  done

  # The row is assembled in memory and appended with a single write.
  local row label count cell value i
  row=$(squeeze_ws "$(xpath_value '//dc:identifier[1]' "$xmlfile")")
  for label in "${labels[@]}"; do
    count=$(xpath_value "count(//${label})" "$xmlfile")
    # Anything that is not a plain number (e.g. an xmlstarlet error) means
    # "no values" instead of breaking the numeric comparison.
    [[ "$count" =~ ^[0-9]+$ ]] || count=0
    cell=''
    for (( i = 1; i <= count; i++ )); do
      value=$(squeeze_ws "$(xpath_value "//${label}[$i]" "$xmlfile")")
      if (( i > 1 )); then
        cell+=$VALUE_SEP
      fi
      cell+=$value
    done
    row+=$FIELD_SEP$cell
  done

  if ! printf '%s\n' "$row" >> "$outfile"; then
    printf 'cannot write %s\n' "$outfile" >&2
    return 1
  fi
  return 0
}

# The exit status of the script is the return status of main.
main "$@"
* Use extract.sh to convert multiple DC files into a single CSV file:
#!/bin/bash
#
# extract.sh - merge many DC.xml files into one '§'-separated CSV file.
#
# Usage: extract.sh ELEMENTS DIR OUTPUT
#   $1 ELEMENTS  file whose first line lists the DC elements, separated by '§'
#   $2 DIR       directory holding the fetched DC.xml files
#   $3 OUTPUT    CSV file to create (overwritten)
#
# Optional environment:
#   DCXML2CSV  command converting one file (default: ./DCXML2CSV.sh); it is
#              called as: DCXML2CSV ELEMENTS XMLFILE OUTPUT
#   DC_PREFIX  file name prefix of the files to convert (default: smtextlib_)
#
# Stdout: the path of every file handed to the converter, plus whatever the
#         converter itself prints.
# Exit status: 0 on success, 1 if OUTPUT cannot be written or a conversion
#              failed, 2 on wrong usage, 99 if ELEMENTS or DIR is missing,
#              127 if the converter is missing.

readonly FIELD_SEP='§'

# Print a short usage message on stderr.
usage() {
  printf 'Usage: %s ELEMENTS DIR OUTPUT\n' "${0##*/}" >&2
}

#######################################
# Write the header row, then append one row per matching DC.xml file.
# Arguments: $1 - elements file, $2 - DC directory, $3 - output CSV
# Globals:   DCXML2CSV, DC_PREFIX (read, optional)
# Returns:   0 on success, non-zero on error (see file header)
#######################################
main() {
  if (( $# != 3 )); then
    usage
    return 2
  fi

  local elements=$1 dcdir=$2 outfile=$3
  local converter=${DCXML2CSV:-./DCXML2CSV.sh}
  local prefix=${DC_PREFIX:-smtextlib_}

  if [[ ! -f "$elements" ]]; then
    printf '%s file not found\n' "$elements" >&2
    return 99
  fi
  if [[ ! -d "$dcdir" ]]; then
    printf '%s directory not found\n' "$dcdir" >&2
    return 99
  fi
  if ! command -v "$converter" > /dev/null 2>&1; then
    printf '%s not found or not executable\n' "$converter" >&2
    return 127
  fi

  # Header row: "PID" followed by the element list. Only the first line of
  # the elements file is used and the row is always newline-terminated, so
  # the first data row can never end up glued to the header.
  local header=''
  IFS= read -r header < "$elements"
  header=${header%$'\r'}
  if ! printf 'PID%s%s\n' "$FIELD_SEP" "$header" > "$outfile"; then
    printf 'cannot write %s\n' "$outfile" >&2
    return 1
  fi

  # Collect the entries matching the prefix; an unmatched glob stays literal
  # and is filtered out by the -e test.
  local -a roots=()
  local entry
  for entry in "$dcdir"/"$prefix"*; do
    [[ -e "$entry" ]] || continue
    roots+=("$entry")
  done
  if (( ${#roots[@]} == 0 )); then
    printf 'no %s* files found in %s\n' "$prefix" "$dcdir" >&2
    return 0
  fi

  # NUL-delimited so that any file name (blanks, newlines, globs) is safe.
  local dc rc=0
  while IFS= read -r -d '' dc; do
    # Never feed the CSV being written back into the converter.
    [[ "$dc" -ef "$outfile" ]] && continue
    printf '%s\n' "$dc"
    # stdin is detached so the converter cannot swallow the file list.
    if ! "$converter" "$elements" "$dc" "$outfile" < /dev/null; then
      printf 'conversion of %s failed\n' "$dc" >&2
      rc=1
    fi
  done < <(find "${roots[@]}" -type f -print0)

  return "$rc"
}

# The exit status of the script is the return status of main.
main "$@"
./extract.sh DCElements.txt DC DC/2002.csv