#!/bin/sh

# Abort the script as soon as any unchecked command fails.
set -e

# Three positional arguments are required (command, file selector, data
# directory); otherwise print the usage line and stop.
[ "$#" -ge 3 ] || {
    echo "Usage: ./libpostal_data [upload|download] [base|parser|language_classifier|all] data_dir"
    exit 1
}

# Version string compared against the data directory's data_version file, and
# the GitHub release tag whose assets are downloaded.
LIBPOSTAL_VERSION_STRING='v1'
LIBPOSTAL_RELEASE_VERSION_STRING='v1.0.0'

# GitHub repository and the releases API endpoint derived from it.
GITHUB_API_URL='https://api.github.com'
LIBPOSTAL_REPO_NAME='openvenues/libpostal'
LIBPOSTAL_RELEASE_API_URL="${GITHUB_API_URL}/repos/${LIBPOSTAL_REPO_NAME}/releases"

# S3 location; only referenced by the commented-out upload code.
LIBPOSTAL_S3_BUCKET_NAME='libpostal'
LIBPOSTAL_S3_KEY="s3://${LIBPOSTAL_S3_BUCKET_NAME}"

# Names of the release archives.
LIBPOSTAL_DATA_FILE='libpostal_data.tar.gz'
LIBPOSTAL_PARSER_FILE='parser.tar.gz'
LIBPOSTAL_LANG_CLASS_FILE='language_classifier.tar.gz'

# NOTE(review): the *_LATEST_*_VERSION_STRING variables are not assigned in
# the visible part of this script, so they expand to whatever the environment
# provides (empty when unset). The DATA prefix also names the parser archive
# and the PARSER prefix the data archive. These values are only used by the
# commented-out upload code - confirm the intent before reviving it.
LIBPOSTAL_DATA_S3_PREFIX="${LIBPOSTAL_LATEST_MODEL_VERSION_STRING}/parser.tar.gz"
LIBPOSTAL_PARSER_S3_PREFIX="${LIBPOSTAL_LATEST_DATA_VERSION_STRING}/libpostal_data.tar.gz"
LIBPOSTAL_LANG_CLASS_S3_PREFIX="${LIBPOSTAL_LATEST_MODEL_VERSION_STRING}/language_classifier.tar.gz"

# Positional arguments: the command (upload|download), which archive(s) to
# act on, and the directory that holds the libpostal data.
COMMAND=$1
FILE=$2
LIBPOSTAL_DATA_DIR=$3

# File recording the version of the data directory. The version read from it
# starts out empty and is filled in by the download command.
LIBPOSTAL_DATA_VERSION_FILE=$LIBPOSTAL_DATA_DIR/data_version
LIBPOSTAL_DATA_DIR_VERSION=

# Quoted so that a data directory containing whitespace or glob characters is
# created as a single path instead of being split into several operands.
mkdir -p "$LIBPOSTAL_DATA_DIR"

# Per-archive files holding the timestamp of the last successful update.
LIBPOSTAL_DATA_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated
LIBPOSTAL_PARSER_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_parser
LIBPOSTAL_LANG_CLASS_UPDATED_PATH=$LIBPOSTAL_DATA_DIR/last_updated_language_classifier

# Subdirectories of the data directory that belong to each archive.
BASIC_MODULE_DIRS='address_expansions numex transliteration'
PARSER_MODULE_DIR='address_parser'
LANGUAGE_CLASSIFIER_MODULE_DIR='language_classifier'

# Export the C locale so it applies to every command started by this script.
LC_ALL=C
export LC_ALL

# Timestamp written to a last-updated file that does not exist yet.
EPOCH_DATE='Jan  1 00:00:00 1970'

# Multipart downloads request CHUNK_SIZE-byte ranges (64 MiB); a file is
# downloaded in parts once it is at least two chunks long.
MB=$((1024 * 1024))
CHUNK_SIZE=$((64 * MB))
LARGE_FILE_SIZE=$((2 * CHUNK_SIZE))

# Number of parallel workers handed to `xargs -P` for multipart downloads.
NUM_WORKERS=12

# SIGINT handler: terminate any background jobs of this shell, then exit.
#
# The job list is captured first so that `kill` is only invoked when there is
# something to signal; piping an empty list into `xargs kill` makes GNU xargs
# run `kill` with no operands, which prints a usage error. The exit status is
# 130 (128 + SIGINT) so an interrupted run is never reported as a success.
kill_background_processes() {
    pids=$(jobs -p)
    if [ -n "$pids" ]; then
        # Intentionally unquoted: one PID per word.
        # shellcheck disable=SC2086
        kill $pids 2>/dev/null || true
    fi
    exit 130
}

trap kill_background_processes INT

# Shell snippet executed by `sh -c` once per chunk of a multipart download.
# Positional parameters seen by the snippet:
#   $1 part number, $2 first byte offset, $3 last byte offset,
#   $4 URL, $5 output path for this part.
# The snippets are single-quoted here so that $1..$5 are expanded by the
# worker shell, not by this script. Inside the worker, $4 and $5 are
# double-quoted so URLs and paths are passed to curl as single arguments
# (no field splitting, no globbing on characters such as `?`).
PART_MSG='echo "Downloading part $1: filename=$5, offset=$2, max=$3"'
PART_CURL='curl -L "$4" --silent -H"Range:bytes=$2-$3" --retry 3 --retry-delay 2 -o "$5"'
DOWNLOAD_PART="$PART_MSG;$PART_CURL"


# Download a file as byte-range chunks fetched in parallel, then concatenate
# the chunks, in order, into the destination file.
#
# Arguments:
#   $1 - URL to download
#   $2 - destination path; chunk i is staged as "<destination>.<i>"
#   $3 - total size of the remote file in bytes
# Globals read:
#   CHUNK_SIZE    - bytes per chunk
#   NUM_WORKERS   - number of parallel workers (xargs -P)
#   DOWNLOAD_PART - shell snippet run per chunk with the positional
#                   parameters: part number, offset, max, URL, part path
download_multipart() {
    url=$1
    filename=$2
    size=$3

    # Integer division: the final chunk also carries the remainder.
    num_chunks=$((size/CHUNK_SIZE))
    # A file shorter than one chunk still needs a single request.
    if [ "$num_chunks" -lt 1 ]; then
        num_chunks=1
    fi
    echo "Downloading multipart: $url, size=$size, num_chunks=$num_chunks"
    offset=0
    i=0
    # Emit five NUL-terminated fields per chunk; xargs hands each group of
    # five to one worker shell.
    while [ "$i" -lt "$num_chunks" ]; do
        i=$((i+1))
        part_filename="$filename.$i"
        if [ "$i" -lt "$num_chunks" ]; then
            max=$((offset+CHUNK_SIZE-1))
        else
            max=$size
        fi
        printf "%s\0%s\0%s\0%s\0%s\0" "$i" "$offset" "$max" "$url" "$part_filename"
        offset=$((offset+CHUNK_SIZE))
    done | xargs -0 -n 5 -P "$NUM_WORKERS" sh -c "$DOWNLOAD_PART" --

    # Start from an empty destination. This uses the function's own
    # destination argument rather than a variable set by the caller.
    : > "$filename"

    # The loop above ran in a pipeline subshell, so the counter restarts here.
    i=0
    while [ "$i" -lt "$num_chunks" ]; do
        i=$((i+1))
        part_filename="$filename.$i"
        cat "$part_filename" >> "$filename"
        rm "$part_filename"
    done
}


# Download and unpack one libpostal archive when the remote copy is newer
# than the timestamp recorded after the last successful update.
#
# Arguments:
#   $1  - file holding the last-updated timestamp for this archive
#   $2  - data directory the archive is extracted into
#   $3  - metadata URL probed with a conditional HEAD request
#   $4  - download URL of the archive
#   $5  - archive size in bytes (selects single vs. multipart download)
#   $6  - archive file name, staged as "<data directory>/<file name>"
#   $7  - human-readable name used in progress messages
#   $8+ - subdirectories of the data directory replaced by the archive
# Globals read: EPOCH_DATE, LARGE_FILE_SIZE
# Returns: non-zero if the single-request download or the extraction fails.
download_file() {
    updated_path=$1
    data_dir=$2
    metadata_url=$3
    url=$4
    size=$5
    filename=$6
    name=$7
    shift 7
    # "$@" now holds only the subdirectories to replace.

    local_path=$data_dir/$filename

    if [ ! -e "$updated_path" ]; then
        echo "$EPOCH_DATE" > "$updated_path"
    fi

    echo "Checking for new libpostal $name..."

    # Conditional HEAD request: a final status of 200 means the remote file
    # changed after the recorded timestamp.
    if curl -LsI "$metadata_url" -z "$(cat "$updated_path")" --remote-time -w '%{http_code}' -o /dev/null | grep -q '^200$'; then
        echo "New libpostal $name available"

        if [ "$size" -ge "$LARGE_FILE_SIZE" ]; then
            download_multipart "$url" "$local_path" "$size"
        else
            curl -L "$url" --retry 3 --retry-delay 2 -o "$local_path" || return 1
        fi

        for subdir in "$@"; do
            # ${data_dir:?} aborts instead of deleting "/<subdir>" should the
            # data directory ever be empty.
            rm -rf "${data_dir:?}/$subdir"
        done
        tar -xvzf "$local_path" --no-same-owner -C "$data_dir" || return 1

        # Record the archive's modification time plus one second, and only
        # after a successful extraction, so that a failed unpack is retried
        # on the next run instead of being reported as up to date.
        # NOTE(review): the extra second presumably keeps the next
        # conditional request from matching this same download - confirm.
        stamp=
        if date -ud "@$(date -ur . +%s)" >/dev/null 2>&1; then
            # GNU date: -r takes a reference file, -d parses a date string.
            stamp=$(date -ud "$(date -ud "@$(date -ur "$local_path" +%s)") + 1 second") || stamp=
        elif stat -f %Sm . >/dev/null 2>&1; then
            # BSD date/stat: -r takes epoch seconds, -v adjusts the value.
            stamp=$(date -ur "$(stat -f %m "$local_path")" -v+1S) || stamp=
        fi
        if [ -n "$stamp" ]; then
            echo "$stamp" > "$updated_path"
        fi

        rm "$local_path"
    else
        echo "libpostal $name up to date"
    fi
}

# Entry point: dispatch on the requested command.
if [ "$COMMAND" = "download" ]; then
    if [ -e "$LIBPOSTAL_DATA_VERSION_FILE" ]; then
        LIBPOSTAL_DATA_DIR_VERSION=$(cat "$LIBPOSTAL_DATA_VERSION_FILE")
    fi

    # A data directory written for a different version is wiped, together
    # with its timestamps, so that everything is downloaded again.
    if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_VERSION_STRING" ]; then
        echo "Old version of datadir detected, removing..."
        for subdir in $BASIC_MODULE_DIRS $PARSER_MODULE_DIR $LANGUAGE_CLASSIFIER_MODULE_DIR; do
            # ${...:?} aborts rather than deleting "/<subdir>" when the data
            # directory argument is empty.
            rm -rf "${LIBPOSTAL_DATA_DIR:?}/$subdir"
        done

        # Legacy, blow it away too to be nice
        rm -rf "${LIBPOSTAL_DATA_DIR:?}/geodb"

        rm -f "$LIBPOSTAL_DATA_UPDATED_PATH"
        rm -f "$LIBPOSTAL_LANG_CLASS_UPDATED_PATH"
        rm -f "$LIBPOSTAL_PARSER_UPDATED_PATH"
    fi

    mkdir -p "$LIBPOSTAL_DATA_DIR"

    # The release id is the first "id" field of the release JSON. A failed
    # lookup is reported explicitly instead of letting `set -e` end the
    # script without any message.
    release_id=$(curl -s "$LIBPOSTAL_RELEASE_API_URL/tags/$LIBPOSTAL_RELEASE_VERSION_STRING" | grep "\"id\"" | head -n1 | grep -o '[0-9][0-9]*') || release_id=
    if [ -z "$release_id" ]; then
        echo "Could not determine the GitHub release id for $LIBPOSTAL_RELEASE_VERSION_STRING" >&2
        exit 1
    fi

    release_assets=$(curl -s "$LIBPOSTAL_RELEASE_API_URL/$release_id/assets") || release_assets=

    # Extract one column per temp file (name, metadata URL, download URL,
    # size), then join them line by line. printf is used instead of echo
    # because some sh implementations expand backslash escapes in echo
    # arguments, and the JSON may contain backslashes.
    asset_names_tempfile="$LIBPOSTAL_DATA_DIR/asset_names.tmp"
    printf '%s\n' "$release_assets" | grep -o '"name": *"[^"]*"' | grep -o '"[^"]*"$' | tr -d '"' > "$asset_names_tempfile"
    asset_metadata_tempfile="$LIBPOSTAL_DATA_DIR/asset_metadata.tmp"
    printf '%s\n' "$release_assets" | grep -o '"url": *"[^"]*/releases/assets/[0-9]*"' | grep -o '"[^"]*"$' | tr -d '"' > "$asset_metadata_tempfile"
    asset_urls_tempfile="$LIBPOSTAL_DATA_DIR/asset_urls.tmp"
    printf '%s\n' "$release_assets" | grep -o '"browser_download_url": *"[^"]*"' | grep -o '"[^"]*"$' | tr -d '"' > "$asset_urls_tempfile"
    asset_sizes_tempfile="$LIBPOSTAL_DATA_DIR/asset_sizes.tmp"
    # `|| true`: a no-match here is reported by the empty-list check below.
    printf '%s\n' "$release_assets" | grep -o '"size": *[0-9]*' | grep -o '[0-9]*$' > "$asset_sizes_tempfile" || true

    assets_tempfile="$LIBPOSTAL_DATA_DIR/assets.tmp"
    paste -d' ' "$asset_names_tempfile" "$asset_metadata_tempfile" "$asset_urls_tempfile" "$asset_sizes_tempfile" > "$assets_tempfile"

    rm "$asset_names_tempfile" "$asset_metadata_tempfile" "$asset_urls_tempfile" "$asset_sizes_tempfile"

    if [ ! -s "$assets_tempfile" ]; then
        echo "No assets found for libpostal release $LIBPOSTAL_RELEASE_VERSION_STRING" >&2
        rm -f "$assets_tempfile"
        exit 1
    fi

    while read -r asset asset_metadata_url asset_url asset_size; do
        # Skip incomplete lines (the extracted columns did not line up).
        if [ -z "$asset_size" ]; then
            continue
        fi

        # The *_MODULE_DIR(S) values are word lists and are split on purpose.
        case "$asset" in
            "$LIBPOSTAL_DATA_FILE")
                if [ "$FILE" = "base" ] || [ "$FILE" = "all" ]; then
                    # shellcheck disable=SC2086
                    download_file "$LIBPOSTAL_DATA_UPDATED_PATH" "$LIBPOSTAL_DATA_DIR" "$asset_metadata_url" "$asset_url" "$asset_size" "$LIBPOSTAL_DATA_FILE" "data file" $BASIC_MODULE_DIRS
                fi
                ;;
            "$LIBPOSTAL_PARSER_FILE")
                if [ "$FILE" = "parser" ] || [ "$FILE" = "all" ]; then
                    # shellcheck disable=SC2086
                    download_file "$LIBPOSTAL_PARSER_UPDATED_PATH" "$LIBPOSTAL_DATA_DIR" "$asset_metadata_url" "$asset_url" "$asset_size" "$LIBPOSTAL_PARSER_FILE" "parser data file" $PARSER_MODULE_DIR
                fi
                ;;
            "$LIBPOSTAL_LANG_CLASS_FILE")
                if [ "$FILE" = "language_classifier" ] || [ "$FILE" = "all" ]; then
                    # shellcheck disable=SC2086
                    download_file "$LIBPOSTAL_LANG_CLASS_UPDATED_PATH" "$LIBPOSTAL_DATA_DIR" "$asset_metadata_url" "$asset_url" "$asset_size" "$LIBPOSTAL_LANG_CLASS_FILE" "language classifier data file" $LANGUAGE_CLASSIFIER_MODULE_DIR
                fi
                ;;
        esac

        # Written per processed asset (not once after the loop) so that the
        # archives already installed survive a later failure in this loop.
        if [ "$LIBPOSTAL_DATA_DIR_VERSION" != "$LIBPOSTAL_VERSION_STRING" ]; then
            echo "$LIBPOSTAL_VERSION_STRING" > "$LIBPOSTAL_DATA_VERSION_FILE"
        fi
    done < "$assets_tempfile"
    rm "$assets_tempfile"

elif [ "$COMMAND" = "upload" ]; then
    echo "upload not implemented yet"

    #if [ $FILE = "base" ] || [ $FILE = "all" ]; then
    #    tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $BASIC_MODULE_DIRS
    #    aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_DATA_FILE $LIBPOSTAL_S3_KEY/$LIBPOSTAL_DATA_S3_PREFIX/
    #fi

    #if [ $FILE = "parser" ] || [ $FILE = "all" ]; then
    #    latest_parser=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_PARSER_S3_PREFIX/latest)
    #    tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $PARSER_MODULE_DIR
    #    parser_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_PARSER_S3_PREFIX/$latest_parser/"
    #    aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_PARSER_FILE $parser_s3_dir
    #fi

    #if [ $FILE = "language_classifier" ] || [ $FILE = "all" ]; then
    #    latest_lang_class=$(curl --silent $LIBPOSTAL_S3_BUCKET_URL/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/latest)
    #    tar -C $LIBPOSTAL_DATA_DIR -cvzf $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $LANGUAGE_CLASSIFIER_MODULE_DIR
    #    lang_class_s3_dir="$LIBPOSTAL_S3_KEY/$LIBPOSTAL_LANG_CLASS_S3_PREFIX/$latest_lang_class/"
    #    aws s3 cp --acl=public-read $LIBPOSTAL_DATA_DIR/$LIBPOSTAL_LANG_CLASS_FILE $lang_class_s3_dir
    #fi
else
    echo "Invalid command: $COMMAND"
    exit 1
fi
