#!/bin/bash

# Public domain notice for all NCBI EDirect scripts is located at:
# https://www.ncbi.nlm.nih.gov/books/NBK179288/#chapter6.Public_Domain_Notice

# archive-pids

# remember when the script started, for the total elapsed time report
total_start=$(date "+%s")

# directory holding this script; a relative location is converted to an absolute path
pth=$( dirname "$0" )
if [ "${pth#/}" = "$pth" ]
then
  # no leading slash was stripped, so the path is relative
  pth=$(cd "$pth" && pwd)
fi

# append the script directory to PATH unless it is already one of its entries
if [[ ":$PATH:" != *":$pth:"* ]]
then
  PATH="$PATH:$pth"
  export PATH
fi

# refuse to continue when no executable "go" command can be found
hasgo=$( command -v go )
[ -x "$hasgo" ] || {
  echo "ERROR: The Go (golang) compiler must be installed locally in order to process the data, EXITING" >&2
  exit 1
}

# database-specific parameters

dbase="pubmed"
fields="PMCID"

# control flags, adjusted below by command-line arguments

useFtp=true
useHttps=false

scratch=false

download=true

e2index=false
e2invert=false
e2merge=false
e2post=false

# consume recognized leading arguments; the first unrecognized one stops parsing
while [ $# -gt 0 ]
do
  case "$1" in
    daily | -daily )
      # index and invert only
      e2index=true
      e2invert=true
      ;;
    index | -index | reindex | -reindex )
      # all four stages
      e2index=true
      e2invert=true
      e2merge=true
      e2post=true
      ;;
    clean | -clean | clear | -clear | scrub | -scrub | scour | -scour | scratch | -scratch | erase | -erase )
      # delete Scratch directories
      scratch=true
      ;;
    -ftp )
      useFtp=true
      useHttps=false
      export EDIRECT_NO_ASPERA=true
      ;;
    -http | -https )
      useFtp=false
      useHttps=true
      ;;
    * )
      break
      ;;
  esac
  # every recognized argument is dropped here (the default arm leaves the loop first)
  shift
done

if [ "$scratch" = true ]
then
  # cleaning cannot be combined with any indexing request
  if [ "$e2index" = true ]
  then
    echo "ERROR: Cleaning and indexing must be done in separate commands, EXITING" >&2
    exit 1
  fi

  pm-clean -db "$dbase" -fields "$fields" -scratch
  exit 0
fi

# get path to local folder

# operating system name from uname, with the "_NT-<version>" suffix shortened to "_NT"
# and a leading MINGW<digits> rewritten to CYGWIN
osname=$( uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/' )

# GetLocalArchiveFolder DB FOLDER
#
# Prints the path that "rchive -local DB FOLDER" reports, with one trailing
# slash removed. When osname is CYGWIN_NT and /bin/cygpath is executable, the
# path is first converted with "cygpath -w".
#
# Arguments: $1 - database name, $2 - folder name (both passed through to rchive)
# Globals:   osname (read)
# Outputs:   the path on stdout; an error message on stderr if rchive prints nothing
# Exits:     status 1 when no path is reported. Inside $( ... ) this only ends the
#            command substitution, so callers need to check its exit status.
GetLocalArchiveFolder() {

  # locals keep a direct call from overwriting the script-wide "target" variable
  local dbs="$1"
  local fld="$2"
  local target

  # find selected local archive folder from environment variables or configuration file
  target=$( rchive -local "$dbs" "$fld" )

  if [ -z "$target" ]
  then
    echo "ERROR: Must supply path to local data by setting EDIRECT_LOCAL_ARCHIVE environment variable" >&2
    exit 1
  fi

  if [ "$osname" = "CYGWIN_NT" ] && [ -x /bin/cygpath ]
  then
    target=$( cygpath -w "$target" )
  fi

  # remove trailing slash
  target=${target%/}

  echo "$target"
}

date
echo "" >&2

pm-setup -db "$dbase"

echo "Preparing Drives" >&2
pm-prepare -db "$dbase"
echo "" >&2

# Resolve every local archive folder. GetLocalArchiveFolder runs inside a
# command substitution, so an "exit" there only ends that subshell; the
# explicit "|| exit 1" stops the script instead of continuing with an empty path.
archiveBase=$( GetLocalArchiveFolder "$dbase" "Archive" ) || exit 1
dataBase=$( GetLocalArchiveFolder "$dbase" "Data" ) || exit 1
extrasBase=$( GetLocalArchiveFolder "$dbase" "Extras" ) || exit 1
indexBase=$( GetLocalArchiveFolder "$dbase" "Index" ) || exit 1
invertBase=$( GetLocalArchiveFolder "$dbase" "Invert" ) || exit 1
mergedBase=$( GetLocalArchiveFolder "$dbase" "Merged" ) || exit 1
postingsBase=$( GetLocalArchiveFolder "$dbase" "Postings" ) || exit 1
scratchBase=$( GetLocalArchiveFolder "$dbase" "Scratch" ) || exit 1
currentBase=$( GetLocalArchiveFolder "$dbase" "Current" ) || exit 1
indexedBase=$( GetLocalArchiveFolder "$dbase" "Indexed" ) || exit 1
invertedBase=$( GetLocalArchiveFolder "$dbase" "Inverted" ) || exit 1

# elapsed seconds per stage, filled in by the stages that actually run
IDX=""
INV=""
MRG=""
PST=""

# when the Scratch folder exists, clear intermediate files left in the
# Current, Indexed and Inverted folders
if [ -d "${scratchBase}" ]
then
  echo "Removing Previous Indices" >&2

  target="${currentBase}"
  find "$target" -name "*.xml" -delete
  find "$target" -name "*.xml.gz" -delete

  target="${indexedBase}"
  find "$target" -name "*.e2x" -delete
  find "$target" -name "*.e2x.gz" -delete

  target="${invertedBase}"
  find "$target" -name "*.inv" -delete
  find "$target" -name "*.inv.gz" -delete
fi

# merged files are always cleared when the Merged folder exists
if [ -d "${mergedBase}" ]
then
  target="${mergedBase}"
  find "$target" -name "*.mrg" -delete
  find "$target" -name "*.mrg.gz" -delete
fi

# Index stage: stream every *.inv.gz file found in the subdirectories of the
# Invert folder through xtract and the two Go helper programs, which receive
# the Indexed folder as their output target. Elapsed seconds are stored in IDX.
if [ "$e2index" = true ]
then
  seconds_start=$(date "+%s")
  echo "Indexing PMID-PMCID" >&2

  cd "${indexedBase}"

  target="${indexedBase}"

  # use the configured Invert folder rather than a machine-specific absolute path
  for dir in "${invertBase}"/*
  do
    if [ -d "$dir" ]
    then
      # emits comma-separated PMCID and InvKey values, one pair per line
      gunzip -c "$dir"/*.inv.gz |
      xtract -pattern InvDocument -KEY InvKey \
        -block InvIDs/PMCID \
          -tab "," -element PMCID "&KEY" -deq "\n"
    fi
  done |
  go run "$pth/extern/prep-pmcid.go" |
  go run "$pth/extern/prep-finish.go" 500000 "$target" "pmcid"

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  IDX=$seconds
  echo "IDX $IDX seconds" >&2
  echo "" >&2
  sleep 1
fi

# Invert stage: convert each *.e2x.gz file in the Indexed folder into a
# matching *.inv.gz file in the Inverted folder using "rchive -e2invert".
# Elapsed seconds are stored in INV.
if [ "$e2invert" = true ]
then
  seconds_start=$(date "+%s")
  echo "Inverting PMID-PMCID" >&2

  # the glob below is relative, so only proceed when the cd actually succeeded
  if [ -d "${indexedBase}" ] && cd "${indexedBase}"
  then
    target="${invertedBase}"
    for fl in *.e2x.gz
    do
      # an unmatched glob stays as the literal pattern; skipping it avoids
      # creating a bogus file named "*.inv.gz" in the target folder
      [ -f "$fl" ] || continue
      base=${fl%.e2x.gz}
      echo "$base.inv"
      gunzip -c "$fl" |
      rchive -e2invert |
      gzip -1 > "$target/$base.inv.gz"
      sleep 1
    done
  fi

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  INV=$seconds
  echo "INV $INV seconds" >&2
  echo "" >&2
  sleep 1
fi

# Merge stage: hand all *.inv.gz files in the Inverted folder to
# "rchive -gzip -mergelink", with the Merged folder as destination.
# Elapsed seconds are stored in MRG.
if [ "$e2merge" = true ]
then
  seconds_start=$(date "+%s")
  echo "Merging Inverted Indices" >&2

  if [ -d "${invertedBase}" ]
  then
    cd "${invertedBase}"

    # destination path, converted with cygpath on Cygwin/MinGW systems
    target="${mergedBase}"
    osname=$( uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/' )
    if [ "$osname" = "CYGWIN_NT" ] && [ -x /bin/cygpath ]
    then
      target=$( cygpath -w "$target" )
    fi
    # drop a single trailing slash
    target=${target%/}
    rchive -gzip -mergelink "$target" *.inv.gz
  fi

  seconds_end=$(date "+%s")
  MRG=$((seconds_end - seconds_start))
  seconds=$MRG
  echo "MRG $MRG seconds" >&2
  echo "" >&2
  sleep 1
fi

# Postings stage: pass the sorted *.mrg.gz files of the Merged folder to
# "rchive -promotelink" in batches of up to 100 names, with the Postings
# folder as destination. Elapsed seconds are stored in PST.
if [ "$e2post" = true ]
then
  seconds_start=$(date "+%s")
  echo "Producing Postings Files" >&2

  # the glob below is relative, so only proceed when the cd actually succeeded
  if [ -d "${mergedBase}" ] && cd "${mergedBase}"
  then
    target="${postingsBase}"
    osname=$( uname -s | sed -e 's/_NT-.*$/_NT/; s/^MINGW[0-9]*/CYGWIN/' )
    if [ "$osname" = "CYGWIN_NT" ] && [ -x /bin/cygpath ]
    then
      target=$( cygpath -w "$target" )
    fi
    target=${target%/}
    for fl in *.mrg.gz
    do
      # an unmatched glob stays as the literal pattern; do not pass it on
      [ -f "$fl" ] || continue
      echo "$fl"
    done |
    sort |
    xargs -n 100 echo |
    while read -r files
    do
      # some xargs implementations run the command once on empty input,
      # which produces an empty line here; skip it
      [ -n "$files" ] || continue
      # $files is left unquoted on purpose so each name becomes its own argument
      rchive -db "$dbase" -promotelink "$target" "$fields" $files
    done
  fi

  seconds_end=$(date "+%s")
  seconds=$((seconds_end - seconds_start))
  PST=$seconds
  echo "PST $PST seconds" >&2
  echo "" >&2
  sleep 1
fi

wait

# check postings: after a postings run, look up a fixed identifier through
# xlink and keep the matching output line (empty when the lookup finds nothing)
okay=""
if [ "$e2post" = true ]
then
  okay=$( echo 2539356 | xlink -db pubmed -target PMCID | grep -w 209839 )

  if [ -n "$okay" ]
  then
    echo "Archive and Index are OK" >&2
    echo >&2

    # merged intermediates are removed only once the lookup has succeeded
    if [ -d "${mergedBase}" ]
    then
      target="${mergedBase}"
      find "$target" \( -name "*.mrg" -o -name "*.mrg.gz" \) -delete
    fi
  fi
fi

# leave whichever working folder a stage changed into
cd

echo "ARCHIVE-PIDS " >&2

echo >&2

# PrintTime FLAG LABEL SECONDS
#
# Writes "LABEL SECONDS seconds" to stderr when FLAG is the string "true" and
# SECONDS is non-empty. A stage whose timing was never recorded is skipped, so
# no line with a missing number is printed.
PrintTime() {

  if [ "$1" = true ] && [ -n "$3" ]
  then
    echo "$2 $3 seconds" >&2
  fi
}

# report each stage as "flag variable:timing variable"; the timing variable
# name doubles as the label that is printed
for stage in download:DWN e2index:IDX e2invert:INV e2merge:MRG e2post:PST
do
  flagName=${stage%%:*}
  label=${stage#*:}
  # indirect expansion reads the flag and the recorded seconds by variable name
  PrintTime "${!flagName}" "$label" "${!label}"
done

echo "" >&2

# PrintTotalElapsedTime LABEL SECONDS
#
# Writes "LABEL N second(s)" to stderr. When N is 60 or more, it appends
# ", or" followed by the non-zero day / hour / minute / second components.
# Each unit is pluralized unless its count is exactly one, so zero reads
# "0 seconds". A missing or empty SECONDS argument is treated as 0.
function PrintTotalElapsedTime {
  local L=$1
  local T=${2:-0}
  local D=$((T/60/60/24))
  local H=$((T/60/60%24))
  local M=$((T/60%60))
  local S=$((T%60))
  printf '%s %d second' "$L" "$T" 1>&2
  # plural for every total except exactly one
  (( T != 1 )) && printf 's' 1>&2
  if (( T > 59 ))
  then
    printf ', or' 1>&2
    # components that are zero are omitted, so only counts >= 1 reach the plural test
    (( D > 0 )) && printf ' %d day' "$D" 1>&2
    (( D > 1 )) && printf 's' 1>&2
    (( H > 0 )) && printf ' %d hour' "$H" 1>&2
    (( H > 1 )) && printf 's' 1>&2
    (( M > 0 )) && printf ' %d minute' "$M" 1>&2
    (( M > 1 )) && printf 's' 1>&2
    (( S > 0 )) && printf ' %d second' "$S" 1>&2
    (( S > 1 )) && printf 's' 1>&2
  fi
  printf '\n' 1>&2
}

# total wall-clock time since the script started
total_end=$(date "+%s")
TOT=$((total_end - total_start))
total=$TOT
PrintTotalElapsedTime "TOT" "$TOT"
echo >&2
