#!/bin/sh
":" //# -*- mode: js -*-; exec /usr/bin/env node --max-old-space-size=9000 --stack-size=42000 "$0" "$@"

"use strict";

/************************************/
/* MODULE VARIABLE SECTION **********/
/************************************/

const os = require('os')
const osProcess = require('process')
const osPath = require( 'path' )
const expandHomeDir = require( 'expand-home-dir' )

const fs = require( 'fs-extra' )
const mimeDb = require( 'mime-db' )
const mime = require( 'mime-types' )

const packageInfo = require('./package.json')
const genericPool = require( 'generic-pool' )
const asyncRead = require('promised-read').read
const cheerio = require('cheerio')
const command = require('commander')

const csvOutput = require('csv-stringify')

const moment = require("moment")
require("moment-duration-format")

const startTime = Date.now()

function elapsedStr( from , to = Date.now()) {
    return moment.duration( to - from ).format('d[d]hh:mm:ss.SSS',{ stopTrim: "h" })
}

function log ( ...args ) {
    console.log( elapsedStr( startTime ), ... args )
}

function warning ( ...args ) {
    log( ...args )
}

function fatal ( ...args ) {
    log( ...args )
    osProcess.exit( 1 )
}

//~ var lzma = require('lzma-native')
try {
  var lzma = require('xz')
} catch (er) {
    if ( os.type() == 'Windows_NT' ) {
        fatal( 'Module "xz" is not available on Windows' )
    } else {
        fatal( 'Module "xz" is required' )
    }
}
//~ var lzma = require('node-liblzma')

var srcPath;
var outPath;
var src; // input file reader

var articles = null;
var metadata = [];

function readUInt64LE(buf, offset) {
    var lowBits = buf.readUInt32LE(offset);
    var highBits = buf.readUInt32LE(offset + 4);
    return highBits * 0x100000000 + lowBits
};

function blobPath(clusterIdx, blobIdx) {
    return osPath.join(outPath, clusterIdx + '-' + blobIdx + '-blob');
}

function articlePath(article) {
    return osPath.join(outPath, article.url);
}

//
// class Reader
//
class Reader {
    constructor ( path ) {
        this.path = path;
        this.position = 0;
        this.file = fs.open( path, 'r' )

        this.queue = genericPool.createPool(
            {
                async create () { return Symbol() },
                async destroy ( resource ) { },
            },
            {}
        )
    }

    async read ( length, position ) {

        const token = await this.queue.acquire()
        const fd = await this.file

        if (typeof position !== 'number')
            position = this.position
        this.position = position + length

        const data = Buffer.alloc(length)
        const bytes = await fs.read( fd, data, 0, length, position )
        this.queue.release( token )
        return data
    }

    async close () {
        await this.queue.drain()
        const fd = await this.file
        await fs.close( fd )
    }

    tell () {
        return this.position
    }
}

var headerLength = 80;

var header = {
    magicNumber: 72173914,  //    integer     0   4   Magic number to recognise the file format, must be 72173914
    version: 5,             //    integer     4   4   ZIM=5, bytes 1-2: major, bytes 3-4: minor version of the ZIM file format
    uuid: 0,                //    integer     8   16  unique id of this zim file
    articleCount: 0,        //    integer     24  4   total number of articles
    clusterCount: 0,        //    integer     28  4   total number of clusters
    urlPtrPos: 0,           //    integer     32  8   position of the directory pointerlist ordered by URL
    titlePtrPos: 0,         //    integer     40  8   position of the directory pointerlist ordered by Title
    clusterPtrPos: 0,       //    integer     48  8   position of the cluster pointer list
    mimeListPos: headerLength, // integer     56  8   position of the MIME type list (also header size)
    mainPage: 0xffffffff,   //    integer     64  4   main page or 0xffffffff if no main page
    layoutPage: 0xffffffff, //    integer     68  4   layout page or 0xffffffffff if no layout page
    checksumPos: 0,         //    integer     72  8   pointer to the md5checksum of this file without the checksum itself. This points always 16 bytes before the end of the file.
    geoIndexPos: 0,         //    integer     80  8   pointer to the geo index (optional). Present if mimeListPos is at least 80.
};

async function readHeader ( ) {
    log('reading header')
    const buf = await src.read( headerLength, 0 )

    header.articleCount = buf.readUInt32LE(24);
    header.clusterCount = buf.readUInt32LE(28);

    header.urlPtrPos = readUInt64LE(buf, 32);
    header.titlePtrPos = readUInt64LE(buf, 40);
    header.clusterPtrPos = readUInt64LE(buf, 48);
    header.mimeListPos = readUInt64LE(buf, 56);

    header.mainPage = buf.readUInt32LE(64);
    header.layoutPage = buf.readUInt32LE(68);

    log('header', header);
}

async function processClusterList ( ) {
    log('reading ClusterPointers')
    const buf = await src.read( header.clusterCount * 8, header.clusterPtrPos )

    try {
        for ( let i=0; i < header.clusterCount; i++ ) {
            await processCluster( buf, i )
        }
    } catch ( err ) {
        fatal( 'processClusterList', err )
    }
};

async function processCluster( buf, clusterIdx ) {
    var eof = false;

    const clusterOfs = readUInt64LE( buf, clusterIdx * 8 )

    async function readCompression () {
        const buf = await src.read( 1, clusterOfs )

        return buf.readUInt8(0) & 4;  // xz compressed
    }

    async function getSource( isCompressed ) {
        var slice = fs.createReadStream(
            src.path,
            {
                start: clusterOfs + 1,
                // autoClose: false,
            }
        );

        slice.on('error', function (err) {
            console.error('processCluster', clusterIdx, 'input error', err);
            //~ process.exit(1);
        });

        slice.on('end', function () {
            log('processCluster', clusterIdx, 'input end');
            eof = true;
            //~ process.exit(1);
        });

        slice.on('close', function () {
            log('processCluster', clusterIdx, 'input closed');
            eof = true;
            //~ process.exit(1);
        });

        slice.on('open', function (fd) {
            log('processCluster', clusterIdx, 'input open', fd);
        });

        if ( isCompressed ) { // xz compressed
            const decompressed = new lzma.Decompressor()
            slice.pipe( decompressed )
            return decompressed
        }
        return slice
    }

    async function readOffsets ( input ) {
        const offsets = []
        let noffsets
        for ( var buf; buf = await asyncRead( input,  4 );) {
            var ofs = buf.readUInt32LE( 0 )
            if ( offsets.length == 0 ) {
                noffsets = ofs / 4
            }
            //~ log('readOffsets', clusterIdx, noffsets, offsets.length, ofs);
            offsets.push(ofs)

            if ( offsets.length == noffsets ) {
                //~ log('readOffsets done', clusterIdx, noffsets, offsets.length, ofs);
                return offsets
            }
        }
        fatal( 'readOffsets prematire stream end' )
    }

    async function dumpBlobs ( input, offsets ) {
        for ( let i=0; i < offsets.length-1; i++ ) {

            const blobLen = offsets[ i + 1 ] - offsets[ i ]
            const blob = blobLen === 0 ?
                Buffer.alloc(0)
                : await asyncRead( input, blobLen )
            await fs.outputFile( blobPath( clusterIdx, i ), blob )

            //~ log('readBlobs', clusterIdx, isCompressed, nblobs, i, blobLen)
        }

            //~ log('readBlobs done', clusterIdx, isCompressed, nblobs, blobIdx, blobLen)
    }

    let input

    try {
        const isCompressed = await readCompression()
        log('processCluster', clusterIdx, header.clusterCount, isCompressed);

        input = await getSource( isCompressed )
        const offsets = await readOffsets( input )
        await dumpBlobs( input, offsets )
    } catch ( err ) {
        if (!eof) {
            //~ slice.fd = null;
            input && input.destroy()
        }
        fatal( 'processCluster error', clusterIdx, header.clusterCount, err )
    }
}

async function getDirEntry ( article ) {
    let chunkLen = 512;
    let dirEntry

    function parseDirEntry () {
        article.mimeIdx = dirEntry.readUInt16LE(0);
        article.nameSpace = dirEntry.toString('utf8', 3, 4);

        var strOfs = 16;
        if (article.mimeIdx ==  0xfffe || article.mimeIdx ==  0xfffd) {
            // linktarget or deleted entry
            return true // noop
        } else if (article.mimeIdx ==  0xffff ) { //redirect
            strOfs = 12;
            article.redirectIndex = dirEntry.readUInt32LE(8);
        } else {
            article.clusterIdx = dirEntry.readUInt32LE(8);
            article.blobIdx = dirEntry.readUInt32LE(12);
        }

        // read url and title
        var end = dirEntry.indexOf(0, strOfs);
        if (end != -1) {
            article.url = dirEntry.toString('utf8', strOfs, end);

            var strOfs = end + 1;
            end = dirEntry.indexOf(0, strOfs);
            if (end != -1) {
                article.title = dirEntry.toString('utf8', strOfs, end);
            }
        }

        if (end == -1) // short buffer -- read more
            return false

        log('parseDirEntry', article.index, header.articleCount, '\n', article);

        articles[article.index] = article

        return true
    }

    try {
        while ( true ) {
            dirEntry = await src.read( chunkLen, article.offset )
            if ( parseDirEntry() )
                return article
            chunkLen *= 2
        }
    } catch ( err ) {
        fatal( 'processdirEntry read error', article.index, header.articleCount, err )
    }
}

async function renameBlob( article ) {

    var bpath = blobPath(article.clusterIdx, article.blobIdx)

    if (article.nameSpace == 'M') { // metadata
        const data = await fs.readFile ( bpath, 'utf8' )
        metadata.push([article.url.toLowerCase(), data])
        return fs.unlink( bpath )
    }
    const apath = articlePath( article )

    log('renameBlob', article.index, header.articleCount, bpath, '->', apath )

    return fs.move( bpath, apath, { clobber: true })
}

async function loadArticle( article ) {
    if (article.nameSpace != 'A')
        return null
    const data = await fs.readFile( articlePath( article ))

    try {
        const dom = cheerio.load( data )
        return dom
    } catch ( e ) {
        log( 'cheerio.load error', e, data )
        return null
    }
}

var nameSpaces = ['-', 'A', 'B', 'I', 'J', 'M', 'U', 'W', 'X'];

function alterLinks( article, dom ) {
    var nameSpaceLink = function (elem, attr) {
        let link
        try {
            link = url.parse(elem.attribs[attr], true, true)
        } catch (err) {
            //~ console.error('alterLinks error', err, article, attr, elem.attribs[attr], elem)
            console.error('alterLinks', err.message, elem.attribs[attr], 'at', article.path)
            return
        }
        if ( (link.protocol && link.protocol != 'http:' && link.protocol != 'https:')
                || link.host || ! link.pathname)
            return

        var chunks = link.pathname.split('/')

        if ( chunks[0] == '' // abs path
                || chunks[0] == '..'
                && nameSpaces.indexOf(chunks[1]) != -1) {
            chunks.shift();
            chunks.shift();
            link.pathname = chunks.join('/');
            //~ log('alterLinks', elem.attribs[attr], url.format(link));
            elem.attribs[attr] = url.format(link);
            return // OK
        }
        return
    }

    dom( '[src]' ).each( (i, elem) => nameSpaceLink( elem, 'src' ))
    dom( '[href]' ).each( (i, elem) => nameSpaceLink( elem, 'href' ))
}

async function processArticle ( articleIndex ) {
    if ( articles[ articleIndex ] != null )
        return true // already processed

    const article = {
        index: articleIndex,
        offset: readUInt64LE( rawDirectory, articleIndex * 8 )
    }

    await getDirEntry( article )

    if ( article.mimeIdx ==  0xfffe || article.mimeIdx ==  0xfffd ) {
        // linktarget or deleted entry
        return true // noop
    }
    if ( article.mimeIdx ==  0xffff ) { //redirect
        return storeRedirect( article )
    }

    const moved = await renameBlob( article )
    if (! moved )
        return null
    const dom = await loadArticle( article )
    if (! dom )
        return null
    await alterLinks( article, dom )
    return fs.outputFile( articlePath( article ), Buffer.from( dom.html() ))
}

var rawDirectory

async function processArticleList () {
    log('reading ArticleList')
    articles = Array( header.articleCount )
    rawDirectory = await src.read(header.articleCount * 8, header.urlPtrPos )

    //~ log( 'articleOffsets', articleOffsets);

    for ( let i=0; i < header.articleCount; i++ ) {
        await processArticle( i )
    }
    log( '*** articles' )
    articles.forEach( (val, i ) => log( i, val.nameSpace, val.url ))

    if ( redirectOut )
        return new Promise( ( resolve, reject ) => {
            redirectOut.end( resolve )
        })
}

async function processTitleList () {
    log('reading Title List')
    const titleDirectory = await src.read( header.articleCount * 4, header.titlePtrPos )

    //~ log( 'articleOffsets', articleOffsets);
    log( '*** titles' )

    for ( let i=0; i < header.articleCount; i++ ) {
        const idx = titleDirectory.readUInt32LE( i * 4 )
        log( i, idx, articles[ idx ].nameSpace, articles[ idx ].title, '>', articles[ idx ].url )
    }
}

var redirectOut = null

function storeRedirect ( article ) {
    log('storeRedirect', article)

    if (article.nameSpace == '-' && (article.url == 'favicon' || article.url == 'mainPage'))
        return

    if (! redirectOut) {
        redirectOut = csvOutput({delimiter: '\t'})
        redirectOut.pipe(fs.createWriteStream(osPath.join(outPath, '..', 'redirects.csv')))
    }

    var target = articles[ article.redirectIndex ]
    if (! target) { // fetch target artcile isn't yet processed
        return processArticle( article.redirectIndex )
            .then(() => storeRedirect( article ))
    }

    var item = [ article.nameSpace, article.url, article.title, target.url ]

    log('storeRedirect', item)

    return new Promise(( resolve, reject ) => {
        var write = function () {
            try {
                if (! redirectOut.write(item))
                    return redirectOut.once('drain', write)
                resolve( false )
            } catch ( err ) {
                reject( err )
            }
        }
        write()
    })
}

function storeMetadata () {
    log('storeMetadata');
    if ( metadata.length == 0 )
        return

    var csv = csvOutput({ delimiter: ' ' })
    csv.pipe( fs.createWriteStream( osPath.join( outPath, '..', 'metadata.csv' )))

    return new Promise(( resolve, reject ) => {
        var write = function () {
            try {
                var i = 0;
                var write = function () {
                    while (true) {
                        if ( i == metadata.length ) {
                            log('storeMetadata finished');
                            return csv.end( resolve );
                        }
                        var item = metadata[i];
                        log('storeMetadata', metadata.length, i, item);
                        if (! csv.write( item ))
                            break;
                        i++
                    }
                    csv.once( 'drain', write )
                }
            } catch ( err ) {
                reject( err )
            }
        }
        write()
    })
}

async function core () {
    src = new Reader(srcPath)

    await readHeader( )
    await processClusterList()
    await processArticleList()
    await processTitleList()
    await storeMetadata()

    await src.close()
}

function main () {
    command
    .version( packageInfo.version )
    .arguments( '<wiki-page-URL>' )
    .description( 'Dumps a ZIM file' )
    .option( '-h -help' )
    .parse( process.argv )

    log( command.opts() )

    srcPath = expandHomeDir( command.args[0] )
    outPath = expandHomeDir( command.args[1] )
    if (! outPath ) {
        var parsed = osPath.parse(srcPath)
        outPath = parsed.name
    }

    core()
}

main ()