unzimmer.js
parent 08a1b6ca5c
commit 4b842a8b41
package.json (12 lines changed)

@@ -19,6 +19,7 @@
     "child-process": "*",
     "commander": "^2.11.0",
     "csv-parse": "*",
+    "csv-stringify": "^4.3.1",
     "encodeurl": "^1.0.1",
     "expand-home-dir": "*",
     "fs-extra": "^3.0.1",
@@ -33,17 +34,18 @@
     "moment-duration-format": "^2.2.2",
     "mozjpeg": "*",
     "mz": "^2.6.0",
     "promised-read": "^2.0.1",
     "quick-lru": "^1.0.0",
     "request": "^2.81.0",
-    "request-promise": "^4.2.1",
+    "request-promise": "*",
     "sanitize-filename": "^1.6.1",
     "sharp": "^0.17.3",
     "sqlite": "^2.8.0",
     "sqlite3": "*",
-    "uuid": "*"
+    "uuid": "*",
+    "xz": "^1.3.0"
   },
-  "engines" : {
-    "node" : ">=8.0.0"
+  "engines": {
+    "node": ">=8.0.0"
   },
   "bin": {
     "zimmer": "./zimmer.js",

unzimmer.js (new executable file, 550 lines)

@@ -0,0 +1,550 @@
#!/bin/sh
":" //# -*- mode: js -*-; exec /usr/bin/env node --max-old-space-size=9000 --stack-size=42000 "$0" "$@"
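// sh/node polyglot launcher: run as a shell script, the ":" no-op line above
// immediately re-execs this file under node with enlarged heap and stack
// limits; node instead parses that same line as a string expression followed
// by a comment, so the script body below runs unchanged.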
"use strict";

/************************************/
/* MODULE VARIABLE SECTION **********/
/************************************/

const fs = require( 'fs-extra' )
var mimeDb = require( 'mime-db' );
var mime = require( 'mime-types' );

const packageInfo = require('./package.json');
const genericPool = require( 'generic-pool' )
const asyncRead = require('promised-read').read
const cheerio = require('cheerio')
const command = require('commander')
const url = require('url') // URL parsing for alterLinks below

const osProcess = require('process')
var osPath = require( 'path' );
var expandHomeDir = require( 'expand-home-dir' );
//~ var lzma = require('lzma-native');
var lzma = require('xz');
//~ var lzma = require('node-liblzma');
var csvOutput = require('csv-stringify');

const moment = require("moment")
require("moment-duration-format")

var srcPath;
var outPath;
var src; // input file reader

var articles = null;
var metadata = [];

const startTime = Date.now()

function elapsedStr( from, to = Date.now()) {
    return moment.duration( to - from ).format('d[d]hh:mm:ss.SSS', { stopTrim: "h" })
}

function log ( ...args ) {
    console.log( elapsedStr( startTime ), ...args )
}

function warning ( ...args ) {
    log( ...args )
}

function fatal ( ...args ) {
    log( ...args )
    osProcess.exit( 1 )
}

// Node 8 has no 64-bit buffer reads, so combine two 32-bit halves. The result
// stays exact below 2^53, which covers any realistic ZIM file offset.
function readUInt64LE(buf, offset) {
    var lowBits = buf.readUInt32LE(offset);
    var highBits = buf.readUInt32LE(offset + 4);
    return highBits * 0x100000000 + lowBits
}

function blobPath(clusterIdx, blobIdx) {
    return osPath.join(outPath, clusterIdx + '-' + blobIdx + '-blob');
}

function articlePath(article) {
    return osPath.join(outPath, article.url);
}

//
// class Reader
//
class Reader {
    constructor ( path ) {
        this.path = path;
        this.position = 0;
        this.file = fs.open( path, 'r' )

        // generic-pool with default options (max: 1) acts as a mutex, so
        // concurrent read() calls never interleave on the shared fd
        this.queue = genericPool.createPool(
            {
                async create () { return Symbol() },
                async destroy ( resource ) { },
            },
            {}
        )
    }

    async read ( length, position ) {

        const token = await this.queue.acquire()
        const fd = await this.file

        if (typeof position !== 'number')
            position = this.position
        this.position = position + length

        const data = Buffer.alloc(length)
        const bytes = await fs.read( fd, data, 0, length, position )
        this.queue.release( token )
        return data
    }

    async close () {
        await this.queue.drain()
        const fd = await this.file
        await fs.close( fd )
    }

    tell () {
        return this.position
    }
}
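
// Usage sketch (file name illustrative) -- reads may be issued concurrently
// and are served one at a time, each at an absolute offset:
//
//   const r = new Reader( 'dump.zim' )
//   const buf = await r.read( 4, 0 )   // bytes 0-3 hold the magic number
//   await r.close()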

var headerLength = 80;

var header = {
    magicNumber: 72173914,     // integer  0  4  magic number to recognise the file format, must be 72173914
    version: 5,                // integer  4  4  ZIM=5, bytes 1-2: major, bytes 3-4: minor version of the ZIM file format
    uuid: 0,                   // integer  8 16  unique id of this zim file
    articleCount: 0,           // integer 24  4  total number of articles
    clusterCount: 0,           // integer 28  4  total number of clusters
    urlPtrPos: 0,              // integer 32  8  position of the directory pointer list ordered by URL
    titlePtrPos: 0,            // integer 40  8  position of the directory pointer list ordered by title
    clusterPtrPos: 0,          // integer 48  8  position of the cluster pointer list
    mimeListPos: headerLength, // integer 56  8  position of the MIME type list (also header size)
    mainPage: 0xffffffff,      // integer 64  4  main page or 0xffffffff if no main page
    layoutPage: 0xffffffff,    // integer 68  4  layout page or 0xffffffff if no layout page
    checksumPos: 0,            // integer 72  8  pointer to the md5 checksum of this file, which always sits 16 bytes before the end of the file; the checksum itself is excluded
    geoIndexPos: 0,            // integer 80  8  pointer to the geo index (optional); present if mimeListPos is at least 80
};

async function readHeader ( ) {
    log('reading header')
    const buf = await src.read( headerLength, 0 )

    header.articleCount = buf.readUInt32LE(24);
    header.clusterCount = buf.readUInt32LE(28);

    header.urlPtrPos = readUInt64LE(buf, 32);
    header.titlePtrPos = readUInt64LE(buf, 40);
    header.clusterPtrPos = readUInt64LE(buf, 48);
    header.mimeListPos = readUInt64LE(buf, 56);

    header.mainPage = buf.readUInt32LE(64);
    header.layoutPage = buf.readUInt32LE(68);

    log('header', header);
}

async function processClusterList ( ) {
    log('reading ClusterPointers')
    const buf = await src.read( header.clusterCount * 8, header.clusterPtrPos )

    try {
        for ( let i = 0; i < header.clusterCount; i++ ) {
            await processCluster( buf, i )
        }
    } catch ( err ) {
        fatal( 'processClusterList', err )
    }
}

async function processCluster( buf, clusterIdx ) {
    var eof = false;

    const clusterOfs = readUInt64LE( buf, clusterIdx * 8 )

    async function readCompression () {
        const buf = await src.read( 1, clusterOfs )

        return buf.readUInt8(0) & 4; // 4 -> xz compressed
    }

    async function getSource( isCompressed ) {
        var slice = fs.createReadStream(
            src.path,
            {
                start: clusterOfs + 1, // skip the compression byte
                // autoClose: false,
            }
        );

        slice.on('error', function (err) {
            console.error('processCluster', clusterIdx, 'input error', err);
            //~ process.exit(1);
        });

        slice.on('end', function () {
            log('processCluster', clusterIdx, 'input end');
            eof = true;
            //~ process.exit(1);
        });

        slice.on('close', function () {
            log('processCluster', clusterIdx, 'input closed');
            eof = true;
            //~ process.exit(1);
        });

        slice.on('open', function (fd) {
            log('processCluster', clusterIdx, 'input open', fd);
        });

        if ( isCompressed ) { // xz compressed
            const decompressed = new lzma.Decompressor()
            slice.pipe( decompressed )
            return decompressed
        }
        return slice
    }

    async function readOffsets ( input ) {
        const offsets = []
        let noffsets
        for ( var buf; buf = await asyncRead( input, 4 ); ) {
            var ofs = buf.readUInt32LE( 0 )
            if ( offsets.length == 0 ) {
                // the first offset points just past the offset list itself,
                // so it also determines how many offsets there are
                noffsets = ofs / 4
            }
            //~ log('readOffsets', clusterIdx, noffsets, offsets.length, ofs);
            offsets.push(ofs)

            if ( offsets.length == noffsets ) {
                //~ log('readOffsets done', clusterIdx, noffsets, offsets.length, ofs);
                return offsets
            }
        }
        fatal( 'readOffsets premature stream end' )
    }

    async function dumpBlobs ( input, offsets ) {
        for ( let i = 0; i < offsets.length - 1; i++ ) {

            const blobLen = offsets[ i + 1 ] - offsets[ i ]
            const blob = blobLen === 0 ?
                Buffer.alloc(0)
                : await asyncRead( input, blobLen )
            await fs.outputFile( blobPath( clusterIdx, i ), blob )

            //~ log('readBlobs', clusterIdx, isCompressed, nblobs, i, blobLen)
        }

        //~ log('readBlobs done', clusterIdx, isCompressed, nblobs, blobIdx, blobLen)
    }

    let input

    try {
        const isCompressed = await readCompression()
        log('processCluster', clusterIdx, header.clusterCount, isCompressed);

        input = await getSource( isCompressed )
        const offsets = await readOffsets( input )
        await dumpBlobs( input, offsets )
    } catch ( err ) {
        if (!eof) {
            //~ slice.fd = null;
            input && input.destroy()
        }
        fatal( 'processCluster error', clusterIdx, header.clusterCount, err )
    }
}
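
// Cluster layout recap: one compression byte, then a list of little-endian
// u32 blob offsets (the first offset also fixes the list length, offset / 4),
// then the blob bodies; blob i occupies offsets[i]..offsets[i+1].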

async function getDirEntry ( article ) {
    let chunkLen = 512;
    let dirEntry

    function parseDirEntry () {
        article.mimeIdx = dirEntry.readUInt16LE(0);
        article.nameSpace = dirEntry.toString('utf8', 3, 4);

        var strOfs = 16;
        if (article.mimeIdx == 0xfffe || article.mimeIdx == 0xfffd) {
            // linktarget or deleted entry
            return true // noop
        } else if (article.mimeIdx == 0xffff ) { // redirect
            strOfs = 12;
            article.redirectIndex = dirEntry.readUInt32LE(8);
        } else {
            article.clusterIdx = dirEntry.readUInt32LE(8);
            article.blobIdx = dirEntry.readUInt32LE(12);
        }

        // read url and title
        var end = dirEntry.indexOf(0, strOfs);
        if (end != -1) {
            article.url = dirEntry.toString('utf8', strOfs, end);

            strOfs = end + 1;
            end = dirEntry.indexOf(0, strOfs);
            if (end != -1) {
                article.title = dirEntry.toString('utf8', strOfs, end);
            }
        }

        if (end == -1) // short buffer -- read more
            return false

        log('parseDirEntry', article.index, header.articleCount, '\n', article);

        articles[article.index] = article

        return true
    }

    try {
        while ( true ) {
            dirEntry = await src.read( chunkLen, article.offset )
            if ( parseDirEntry() )
                return article
            chunkLen *= 2 // entry didn't fit -- retry with a larger chunk
        }
    } catch ( err ) {
        fatal( 'getDirEntry read error', article.index, header.articleCount, err )
    }
}
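
// Directory entry layout, per the ZIM spec: u16 mimetype, u8 parameter
// length, one namespace character, u32 revision, then either a u32 redirect
// index (mimetype 0xffff) or u32 cluster + u32 blob numbers, followed by
// NUL-terminated url and title strings.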

async function renameBlob( article ) {

    var bpath = blobPath(article.clusterIdx, article.blobIdx)

    if (article.nameSpace == 'M') { // metadata -- recorded, not kept as a file
        const data = await fs.readFile ( bpath, 'utf8' )
        metadata.push([article.url.toLowerCase(), data])
        await fs.unlink( bpath )
        return false
    }
    const apath = articlePath( article )

    log('renameBlob', article.index, header.articleCount, bpath, '->', apath )

    await fs.move( bpath, apath, { clobber: true })
    return true
}

async function loadArticle( article ) {
    if (article.nameSpace != 'A')
        return null
    const data = await fs.readFile( articlePath( article ))

    try {
        const dom = cheerio.load( data )
        return dom
    } catch ( e ) {
        log( 'cheerio.load error', e, data )
        return null
    }
}

var nameSpaces = ['-', 'A', 'B', 'I', 'J', 'M', 'U', 'W', 'X'];

function alterLinks( article, dom ) {
    var nameSpaceLink = function (elem, attr) {
        let link
        try {
            link = url.parse(elem.attribs[attr], true, true)
        } catch (err) {
            //~ console.error('alterLinks error', err, article, attr, elem.attribs[attr], elem)
            console.error('alterLinks', err.message, elem.attribs[attr], 'at', article.url)
            return
        }
        if ( (link.protocol && link.protocol != 'http:' && link.protocol != 'https:')
                || link.host || ! link.pathname)
            return

        var chunks = link.pathname.split('/')

        if ( chunks[0] == '' // abs path
                || ( chunks[0] == '..'
                    && nameSpaces.indexOf(chunks[1]) != -1 )) {
            chunks.shift();
            chunks.shift();
            link.pathname = chunks.join('/');
            //~ log('alterLinks', elem.attribs[attr], url.format(link));
            elem.attribs[attr] = url.format(link);
            return // OK
        }
        return
    }

    dom( '[src]' ).each( (i, elem) => nameSpaceLink( elem, 'src' ))
    dom( '[href]' ).each( (i, elem) => nameSpaceLink( elem, 'href' ))
}
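
// Example of the rewrite (link names illustrative): inside an extracted
// article, "../A/Foo.html" or "/A/Foo.html" becomes the flat sibling path
// "Foo.html", while external http(s) links and bare fragments pass through
// untouched.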

async function processArticle ( articleIndex ) {
    if ( articles[ articleIndex ] != null )
        return true // already processed

    const article = {
        index: articleIndex,
        offset: readUInt64LE( rawDirectory, articleIndex * 8 )
    }

    await getDirEntry( article )

    if ( article.mimeIdx == 0xfffe || article.mimeIdx == 0xfffd ) {
        // linktarget or deleted entry
        return true // noop
    }
    if ( article.mimeIdx == 0xffff ) { // redirect
        return storeRedirect( article )
    }

    const moved = await renameBlob( article )
    if (! moved )
        return null
    const dom = await loadArticle( article )
    if (! dom )
        return null
    alterLinks( article, dom )
    return fs.outputFile( articlePath( article ), Buffer.from( dom.html() ))
}

var rawDirectory

async function processArticleList () {
    log('reading ArticleList')
    articles = Array( header.articleCount )
    rawDirectory = await src.read( header.articleCount * 8, header.urlPtrPos )

    //~ log( 'articleOffsets', articleOffsets);

    for ( let i = 0; i < header.articleCount; i++ ) {
        await processArticle( i )
    }
    log( '*** articles' )
    articles.forEach( (val, i ) => log( i, val.nameSpace, val.url ))

    if ( redirectOut )
        return new Promise( ( resolve, reject ) => {
            redirectOut.end( resolve )
        })
}

async function processTitleList () {
    log('reading Title List')
    // the title pointer list holds u32 indices into the url-ordered directory
    const titleDirectory = await src.read( header.articleCount * 4, header.titlePtrPos )

    //~ log( 'articleOffsets', articleOffsets);
    log( '*** titles' )

    for ( let i = 0; i < header.articleCount; i++ ) {
        const idx = titleDirectory.readUInt32LE( i * 4 )
        log( i, idx, articles[ idx ].nameSpace, articles[ idx ].title, '>', articles[ idx ].url )
    }
}

var redirectOut = null

function storeRedirect ( article ) {
    log('storeRedirect', article)

    if (article.nameSpace == '-' && (article.url == 'favicon' || article.url == 'mainPage'))
        return

    if (! redirectOut) {
        redirectOut = csvOutput({ delimiter: '\t' })
        redirectOut.pipe(fs.createWriteStream(osPath.join(outPath, '..', 'redirects.csv')))
    }

    var target = articles[ article.redirectIndex ]
    if (! target) { // the target article isn't processed yet -- recurse, then retry
        return processArticle( article.redirectIndex )
        .then(() => storeRedirect( article ))
    }

    var item = [ article.nameSpace, article.url, article.title, target.url ]

    log('storeRedirect', item)

    return new Promise(( resolve, reject ) => {
        var write = function () {
            try {
                if (! redirectOut.write(item))
                    return redirectOut.once('drain', write)
                resolve( false )
            } catch ( err ) {
                reject( err )
            }
        }
        write()
    })
}
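
// redirects.csv rows are tab-separated: namespace, source url, title, target
// url. Illustrative row:  A<TAB>Foo<TAB>Foo<TAB>Bar.html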

function storeMetadata () {
    log('storeMetadata');
    if ( metadata.length == 0 )
        return

    var csv = csvOutput({ delimiter: ' ' })
    csv.pipe( fs.createWriteStream( osPath.join( outPath, '..', 'metadata.csv' )))

    return new Promise(( resolve, reject ) => {
        var i = 0;
        // drain the metadata array into the csv stream, pausing on
        // backpressure and resuming on 'drain'
        var write = function () {
            try {
                while (true) {
                    if ( i == metadata.length ) {
                        log('storeMetadata finished');
                        return csv.end( resolve );
                    }
                    var item = metadata[i];
                    log('storeMetadata', metadata.length, i, item);
                    if (! csv.write( item ))
                        break;
                    i++
                }
                csv.once( 'drain', write )
            } catch ( err ) {
                reject( err )
            }
        }
        write()
    })
}

// Processing pipeline: read the header, dump every cluster's blobs to disk,
// then walk the url-ordered directory renaming blobs into articles (rewriting
// links and recording redirects on the way), list the titles, and finally
// store the collected metadata.
async function core () {
    src = new Reader(srcPath)

    await readHeader( )
    await processClusterList()
    await processArticleList()
    await processTitleList()
    await storeMetadata()

    await src.close()
}

function main () {
    command
        .version( packageInfo.version )
        .arguments( '<zim-file> [output-dir]' )
        .description( 'Dumps a ZIM file' )
        .option( '-h, --help' )
        .parse( process.argv )

    log( command.opts() )

    srcPath = expandHomeDir( command.args[0] )
    outPath = expandHomeDir( command.args[1] )
    if (! outPath ) {
        var parsed = osPath.parse(srcPath)
        outPath = parsed.name // default: the ZIM file's base name
    }

    core()
}

main()
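
// Typical invocation (file name illustrative); when the output directory
// argument is omitted it defaults to the ZIM file's base name:
//
//   ./unzimmer.js wikipedia_en_all_2017-08.zim [output-dir]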