From 4b842a8b4106ab96b0b057cb296ffdbd59b61649 Mon Sep 17 00:00:00 2001
From: v
Date: Wed, 21 Nov 2018 19:36:58 +0300
Subject: [PATCH] unzimmer.js

---
 package.json |  12 +-
 unzimmer.js  | 550 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 557 insertions(+), 5 deletions(-)
 create mode 100755 unzimmer.js

diff --git a/package.json b/package.json
index 8d2575a..3aafda6 100644
--- a/package.json
+++ b/package.json
@@ -19,6 +19,7 @@
     "child-process": "*",
     "commander": "^2.11.0",
     "csv-parse": "*",
+    "csv-stringify": "^4.3.1",
     "encodeurl": "^1.0.1",
     "expand-home-dir": "*",
     "fs-extra": "^3.0.1",
@@ -33,17 +34,18 @@
     "moment-duration-format": "^2.2.2",
     "mozjpeg": "*",
     "mz": "^2.6.0",
+    "promised-read": "^2.0.1",
     "quick-lru": "^1.0.0",
-    "request": "^2.81.0",
-    "request-promise": "^4.2.1",
+    "request-promise": "*",
     "sanitize-filename": "^1.6.1",
     "sharp": "^0.17.3",
     "sqlite": "^2.8.0",
     "sqlite3": "*",
-    "uuid": "*"
+    "uuid": "*",
+    "xz": "^1.3.0"
   },
-  "engines" : {
-    "node" : ">=8.0.0"
+  "engines": {
+    "node": ">=8.0.0"
   },
   "bin": {
     "zimmer": "./zimmer.js",
diff --git a/unzimmer.js b/unzimmer.js
new file mode 100755
index 0000000..1ed9422
--- /dev/null
+++ b/unzimmer.js
@@ -0,0 +1,550 @@
#!/bin/sh
":" //# -*- mode: js -*-; exec /usr/bin/env node --max-old-space-size=9000 --stack-size=42000 "$0" "$@"

"use strict";

/************************************/
/* MODULE VARIABLE SECTION **********/
/************************************/

const fs = require( 'fs-extra' )
var mimeDb = require( 'mime-db' );
var mime = require( 'mime-types' );

const packageInfo = require('./package.json');
const genericPool = require( 'generic-pool' )
const asyncRead = require('promised-read').read
const cheerio = require('cheerio')
const command = require('commander')

const osProcess = require('process')
var osPath = require( 'path' );
var url = require( 'url' ); // needed by alterLinks() below
var expandHomeDir = require( 'expand-home-dir' );
//~ var lzma = require('lzma-native');
var lzma = require('xz');
//~ var lzma = require('node-liblzma');
var csvOutput = require('csv-stringify');

const moment = require("moment")
require("moment-duration-format")

var srcPath;
var outPath;
var src; // input file reader

var articles = null;
var metadata = [];

const startTime = Date.now()

function elapsedStr( from, to = Date.now()) {
    return moment.duration( to - from ).format('d[d]hh:mm:ss.SSS',{ stopTrim: "h" })
}

function log ( ...args ) {
    console.log( elapsedStr( startTime ), ...args )
}
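// e.g. log('reading header') prints a stopwatch-style prefix first,
// something like: 00:00:00.042 reading header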
function warning ( ...args ) {
    log( ...args )
}

function fatal ( ...args ) {
    log( ...args )
    osProcess.exit( 1 )
}

function readUInt64LE(buf, offset) {
    var lowBits = buf.readUInt32LE(offset);
    var highBits = buf.readUInt32LE(offset + 4);
    return highBits * 0x100000000 + lowBits
}

function blobPath(clusterIdx, blobIdx) {
    return osPath.join(outPath, clusterIdx + '-' + blobIdx + '-blob');
}

function articlePath(article) {
    return osPath.join(outPath, article.url);
}

//
// class Reader
//
class Reader {
    constructor ( path ) {
        this.path = path;
        this.position = 0;
        this.file = fs.open( path, 'r' )

        // a generic-pool with a single (default) resource, used as a
        // mutex so concurrent read() calls don't interleave
        this.queue = genericPool.createPool(
            {
                async create () { return Symbol() },
                async destroy ( resource ) { },
            },
            {}
        )
    }

    async read ( length, position ) {

        const token = await this.queue.acquire()
        const fd = await this.file

        if (typeof position !== 'number')
            position = this.position
        this.position = position + length

        const data = Buffer.alloc(length)
        await fs.read( fd, data, 0, length, position )
        this.queue.release( token )
        return data
    }

    async close () {
        await this.queue.drain()
        const fd = await this.file
        await fs.close( fd )
    }

    tell () {
        return this.position
    }
}

var headerLength = 80;

var header = {
    magicNumber: 72173914,     // integer  0  4  magic number to recognise the file format, must be 72173914
    version: 5,                // integer  4  4  ZIM=5, bytes 1-2: major, bytes 3-4: minor version of the ZIM file format
    uuid: 0,                   // integer  8 16  unique id of this zim file
    articleCount: 0,           // integer 24  4  total number of articles
    clusterCount: 0,           // integer 28  4  total number of clusters
    urlPtrPos: 0,              // integer 32  8  position of the directory pointer list ordered by URL
    titlePtrPos: 0,            // integer 40  8  position of the directory pointer list ordered by Title
    clusterPtrPos: 0,          // integer 48  8  position of the cluster pointer list
    mimeListPos: headerLength, // integer 56  8  position of the MIME type list (also header size)
    mainPage: 0xffffffff,      // integer 64  4  main page or 0xffffffff if no main page
    layoutPage: 0xffffffff,    // integer 68  4  layout page or 0xffffffff if no layout page
    checksumPos: 0,            // integer 72  8  pointer to the md5 checksum of this file, excluding the checksum itself; always points 16 bytes before the end of the file
    geoIndexPos: 0,            // integer 80  8  pointer to the geo index (optional); present if mimeListPos is at least 80
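    // NB: all multi-byte header fields are little-endian, as everywhere
    // else in the ZIM format (https://wiki.openzim.org/wiki/ZIM_file_format)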
};

async function readHeader ( ) {
    log('reading header')
    const buf = await src.read( headerLength, 0 )

    header.articleCount = buf.readUInt32LE(24);
    header.clusterCount = buf.readUInt32LE(28);

    header.urlPtrPos = readUInt64LE(buf, 32);
    header.titlePtrPos = readUInt64LE(buf, 40);
    header.clusterPtrPos = readUInt64LE(buf, 48);
    header.mimeListPos = readUInt64LE(buf, 56);

    header.mainPage = buf.readUInt32LE(64);
    header.layoutPage = buf.readUInt32LE(68);

    log('header', header);
}

async function processClusterList ( ) {
    log('reading ClusterPointers')
    const buf = await src.read( header.clusterCount * 8, header.clusterPtrPos )

    try {
        for ( let i=0; i < header.clusterCount; i++ ) {
            await processCluster( buf, i )
        }
    } catch ( err ) {
        fatal( 'processClusterList', err )
    }
}

async function processCluster( buf, clusterIdx ) {
    var eof = false;

    const clusterOfs = readUInt64LE( buf, clusterIdx * 8 )

    async function readCompression () {
        const buf = await src.read( 1, clusterOfs )

        return buf.readUInt8(0) & 4; // compression type 4 -> xz (LZMA2) compressed
    }

    async function getSource( isCompressed ) {
        var slice = fs.createReadStream(
            src.path,
            {
                start: clusterOfs + 1,
                // autoClose: false,
            }
        );

        slice.on('error', function (err) {
            console.error('processCluster', clusterIdx, 'input error', err);
            //~ process.exit(1);
        });

        slice.on('end', function () {
            log('processCluster', clusterIdx, 'input end');
            eof = true;
            //~ process.exit(1);
        });

        slice.on('close', function () {
            log('processCluster', clusterIdx, 'input closed');
            eof = true;
            //~ process.exit(1);
        });

        slice.on('open', function (fd) {
            log('processCluster', clusterIdx, 'input open', fd);
        });

        if ( isCompressed ) { // xz compressed
            const decompressed = new lzma.Decompressor()
            slice.pipe( decompressed )
            return decompressed
        }
        return slice
    }

    async function readOffsets ( input ) {
        const offsets = []
        let noffsets
        for ( var buf; buf = await asyncRead( input, 4 );) {
            var ofs = buf.readUInt32LE( 0 )
            if ( offsets.length == 0 ) {
                // the first offset points just past the offset table itself,
                // so it also tells us how many offsets the table holds
                noffsets = ofs / 4
            }
            //~ log('readOffsets', clusterIdx, noffsets, offsets.length, ofs);
            offsets.push(ofs)

            if ( offsets.length == noffsets ) {
                //~ log('readOffsets done', clusterIdx, noffsets, offsets.length, ofs);
                return offsets
            }
        }
        fatal( 'readOffsets premature stream end' )
    }

    async function dumpBlobs ( input, offsets ) {
        for ( let i=0; i < offsets.length-1; i++ ) {

            const blobLen = offsets[ i + 1 ] - offsets[ i ]
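            // a zero-length blob is materialised as an empty Buffer
            // instead of being read from the stream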
            const blob = blobLen === 0
                ? Buffer.alloc(0)
                : await asyncRead( input, blobLen )
            await fs.outputFile( blobPath( clusterIdx, i ), blob )

            //~ log('readBlobs', clusterIdx, isCompressed, nblobs, i, blobLen)
        }

        //~ log('readBlobs done', clusterIdx, isCompressed, nblobs, blobIdx, blobLen)
    }

    let input

    try {
        const isCompressed = await readCompression()
        log('processCluster', clusterIdx, header.clusterCount, isCompressed);

        input = await getSource( isCompressed )
        const offsets = await readOffsets( input )
        await dumpBlobs( input, offsets )
    } catch ( err ) {
        if (!eof) {
            //~ slice.fd = null;
            input && input.destroy()
        }
        fatal( 'processCluster error', clusterIdx, header.clusterCount, err )
    }
}

async function getDirEntry ( article ) {
    let chunkLen = 512;
    let dirEntry

    function parseDirEntry () {
        article.mimeIdx = dirEntry.readUInt16LE(0);
        article.nameSpace = dirEntry.toString('utf8', 3, 4);

        var strOfs = 16;
        if (article.mimeIdx == 0xfffe || article.mimeIdx == 0xfffd) {
            // linktarget or deleted entry
            return true // noop
        } else if (article.mimeIdx == 0xffff ) { // redirect
            strOfs = 12;
            article.redirectIndex = dirEntry.readUInt32LE(8);
        } else {
            article.clusterIdx = dirEntry.readUInt32LE(8);
            article.blobIdx = dirEntry.readUInt32LE(12);
        }

        // read url and title
        var end = dirEntry.indexOf(0, strOfs);
        if (end != -1) {
            article.url = dirEntry.toString('utf8', strOfs, end);

            strOfs = end + 1;
            end = dirEntry.indexOf(0, strOfs);
            if (end != -1) {
                article.title = dirEntry.toString('utf8', strOfs, end);
            }
        }

        if (end == -1) // short buffer -- read more
            return false

        log('parseDirEntry', article.index, header.articleCount, '\n', article);

        articles[article.index] = article

        return true
    }

    try {
        while ( true ) {
            dirEntry = await src.read( chunkLen, article.offset )
            if ( parseDirEntry() )
                return article
            chunkLen *= 2
        }
    } catch ( err ) {
        fatal( 'getDirEntry read error', article.index, header.articleCount, err )
    }
}

async function renameBlob( article ) {

    var bpath = blobPath(article.clusterIdx, article.blobIdx)

    if (article.nameSpace == 'M') { // metadata
        const data = await fs.readFile ( bpath, 'utf8' )
        metadata.push([article.url.toLowerCase(), data])
        await fs.unlink( bpath )
        return null // nothing left to post-process
    }
    const apath = articlePath( article )

    log('renameBlob', article.index, header.articleCount, bpath, '->', apath )

    await fs.move( bpath, apath, { clobber: true })
    return true
}

async function loadArticle( article ) {
    if (article.nameSpace != 'A')
        return null
    const data = await fs.readFile( articlePath( article ))

    try {
        const dom = cheerio.load( data )
        return dom
    } catch ( e ) {
        log( 'cheerio.load error', e, data )
        return null
    }
}

var nameSpaces = ['-', 'A', 'B', 'I', 'J', 'M', 'U', 'W', 'X'];

function alterLinks( article, dom ) {
    var nameSpaceLink = function (elem, attr) {
        let link
        try {
            link = url.parse(elem.attribs[attr], true, true)
        } catch (err) {
            //~ console.error('alterLinks error', err, article, attr, elem.attribs[attr], elem)
            console.error('alterLinks', err.message, elem.attribs[attr], 'at', article.url)
            return
        }
        if ( (link.protocol && link.protocol != 'http:' && link.protocol != 'https:')
            || link.host || ! link.pathname)
            return

        var chunks = link.pathname.split('/')

        if ( chunks[0] == '' // abs path
            || chunks[0] == '..'
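            // links of the form /X/... or ../X/... (X being a ZIM namespace
            // letter) point at another entry in the archive; drop the two
            // leading segments so they resolve within the dump tree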
            && nameSpaces.indexOf(chunks[1]) != -1) {
            chunks.shift();
            chunks.shift();
            link.pathname = chunks.join('/');
            //~ log('alterLinks', elem.attribs[attr], url.format(link));
            elem.attribs[attr] = url.format(link);
            return // OK
        }
        return
    }

    dom( '[src]' ).each( (i, elem) => nameSpaceLink( elem, 'src' ))
    dom( '[href]' ).each( (i, elem) => nameSpaceLink( elem, 'href' ))
}

async function processArticle ( articleIndex ) {
    if ( articles[ articleIndex ] != null )
        return true // already processed

    const article = {
        index: articleIndex,
        offset: readUInt64LE( rawDirectory, articleIndex * 8 )
    }

    await getDirEntry( article )

    if ( article.mimeIdx == 0xfffe || article.mimeIdx == 0xfffd ) {
        // linktarget or deleted entry
        return true // noop
    }
    if ( article.mimeIdx == 0xffff ) { // redirect
        return storeRedirect( article )
    }

    const moved = await renameBlob( article )
    if (! moved )
        return null
    const dom = await loadArticle( article )
    if (! dom )
        return null
    alterLinks( article, dom )
    return fs.outputFile( articlePath( article ), Buffer.from( dom.html() ))
}

var rawDirectory

async function processArticleList () {
    log('reading ArticleList')
    articles = Array( header.articleCount )
    rawDirectory = await src.read( header.articleCount * 8, header.urlPtrPos )

    //~ log( 'articleOffsets', articleOffsets);

    for ( let i=0; i < header.articleCount; i++ ) {
        await processArticle( i )
    }
    log( '*** articles' )
    articles.forEach( (val, i ) => log( i, val.nameSpace, val.url ))

    if ( redirectOut )
        return new Promise( ( resolve, reject ) => {
            redirectOut.end( resolve )
        })
}

async function processTitleList () {
    log('reading Title List')
    const titleDirectory = await src.read( header.articleCount * 4, header.titlePtrPos )

    //~ log( 'articleOffsets', articleOffsets);
    log( '*** titles' )

    for ( let i=0; i < header.articleCount; i++ ) {
        const idx = titleDirectory.readUInt32LE( i * 4 )
        const article = articles[ idx ]
        if (! article ) // linktarget and deleted entries aren't kept
            continue
        log( i, idx, article.nameSpace, article.title, '>', article.url )
    }
}

var redirectOut = null

function storeRedirect ( article ) {
    log('storeRedirect', article)

    if (article.nameSpace == '-' && (article.url == 'favicon' || article.url == 'mainPage'))
        return

    if (! redirectOut) {
        redirectOut = csvOutput({delimiter: '\t'})
        redirectOut.pipe(fs.createWriteStream(osPath.join(outPath, '..', 'redirects.csv')))
    }

    var target = articles[ article.redirectIndex ]
    if (! target) { // the target article isn't processed yet -- fetch it first
        return processArticle( article.redirectIndex )
            .then(() => storeRedirect( article ))
    }

    var item = [ article.nameSpace, article.url, article.title, target.url ]

    log('storeRedirect', item)

    return new Promise(( resolve, reject ) => {
        var write = function () {
            try {
                if (! redirectOut.write(item))
                    return redirectOut.once('drain', write)
                resolve( false )
            } catch ( err ) {
                reject( err )
            }
        }
        write()
    })
}

function storeMetadata () {
    log('storeMetadata');
    if ( metadata.length == 0 )
        return

    var csv = csvOutput({ delimiter: ' ' })
    csv.pipe( fs.createWriteStream( osPath.join( outPath, '..', 'metadata.csv' )))

    return new Promise(( resolve, reject ) => {
        var i = 0;
        var write = function () {
            try {
                while (true) {
                    if ( i == metadata.length ) {
                        log('storeMetadata finished');
                        return csv.end( resolve );
                    }
                    var item = metadata[i];
                    log('storeMetadata', metadata.length, i, item);
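                    // csv.write() returns false once its internal buffer
                    // fills up -- stop writing and resume on 'drain'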
                    if (! csv.write( item ))
                        break;
                    i++
                }
                csv.once( 'drain', write )
            } catch ( err ) {
                reject( err )
            }
        }
        write()
    })
}

async function core () {
    src = new Reader(srcPath)

    await readHeader( )
    await processClusterList()
    await processArticleList()
    await processTitleList()
    await storeMetadata()

    await src.close()
}

function main () {
    command
        .version( packageInfo.version )
        .arguments( '<zim-file> [output-dir]' )
        .description( 'Dumps a ZIM file' )
        .parse( process.argv )

    log( command.opts() )

    srcPath = expandHomeDir( command.args[0] )
    outPath = expandHomeDir( command.args[1] )
    if (! outPath ) {
        var parsed = osPath.parse(srcPath)
        outPath = parsed.name
    }

    core()
}

main ()
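// Example session (a sketch -- the file names here are hypothetical):
//
//   ./unzimmer.js wikipedia_en_all.zim wikipedia-dump
//
// Each cluster is first dumped as <clusterIdx>-<blobIdx>-blob files in the
// output directory; getDirEntry() / renameBlob() then rename those blobs to
// their article URLs, while redirects.csv and metadata.csv are written one
// directory above the dump (osPath.join( outPath, '..', ... )).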