unzimmer.js

v 2018-11-21 19:36:58 +03:00
parent 08a1b6ca5c
commit 4b842a8b41
2 changed files with 557 additions and 5 deletions

package.json

@@ -19,6 +19,7 @@
 "child-process": "*",
 "commander": "^2.11.0",
 "csv-parse": "*",
+"csv-stringify": "^4.3.1",
 "encodeurl": "^1.0.1",
 "expand-home-dir": "*",
 "fs-extra": "^3.0.1",
@@ -33,17 +34,18 @@
 "moment-duration-format": "^2.2.2",
 "mozjpeg": "*",
 "mz": "^2.6.0",
+"promised-read": "^2.0.1",
 "quick-lru": "^1.0.0",
 "request": "^2.81.0",
-"request-promise": "^4.2.1",
+"request-promise": "*",
 "sanitize-filename": "^1.6.1",
 "sharp": "^0.17.3",
 "sqlite": "^2.8.0",
 "sqlite3": "*",
-"uuid": "*"
+"uuid": "*",
+"xz": "^1.3.0"
 },
-"engines" : {
-"node" : ">=8.0.0"
+"engines": {
+"node": ">=8.0.0"
 },
 "bin": {
 "zimmer": "./zimmer.js",

unzimmer.js Executable file

@@ -0,0 +1,550 @@
#!/bin/sh
":" //# -*- mode: js -*-; exec /usr/bin/env node --max-old-space-size=9000 --stack-size=42000 "$0" "$@"
"use strict";
/************************************/
/* MODULE VARIABLE SECTION **********/
/************************************/
const fs = require( 'fs-extra' )
var mimeDb = require( 'mime-db' );
var mime = require( 'mime-types' );
const packageInfo = require('./package.json');
const genericPool = require( 'generic-pool' )
const asyncRead = require('promised-read').read
const cheerio = require('cheerio')
const command = require('commander')
const osProcess = require('process')
var osPath = require( 'path' );
var url = require( 'url' ); // link parsing/formatting in alterLinks()
var expandHomeDir = require( 'expand-home-dir' );
//~ var lzma = require('lzma-native');
var lzma = require('xz');
//~ var lzma = require('node-liblzma');
var csvOutput = require('csv-stringify');
const moment = require("moment")
require("moment-duration-format")
var srcPath;
var outPath;
var src; // input file reader
var articles = null;
var metadata = [];
const startTime = Date.now()
function elapsedStr( from , to = Date.now()) {
return moment.duration( to - from ).format('d[d]hh:mm:ss.SSS',{ stopTrim: "h" })
}
function log ( ...args ) {
console.log( elapsedStr( startTime ), ... args )
}
function warning ( ...args ) {
log( ...args )
}
function fatal ( ...args ) {
log( ...args )
osProcess.exit( 1 )
}
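// Node 8 Buffers have no 64-bit integer read, so the two 32-bit halves are
// combined as high * 2^32 + low; the result is exact for any value below 2^53,
// far beyond any realistic ZIM file offset.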
function readUInt64LE(buf, offset) {
var lowBits = buf.readUInt32LE(offset);
var highBits = buf.readUInt32LE(offset + 4);
return highBits * 0x100000000 + lowBits
};
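// Blobs are first dumped under synthetic "<cluster>-<blob>-blob" names because
// clusters are read before the directory; once the directory entries are
// known, renameBlob() moves each file to its article URL.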
function blobPath(clusterIdx, blobIdx) {
return osPath.join(outPath, clusterIdx + '-' + blobIdx + '-blob');
}
function articlePath(article) {
return osPath.join(outPath, article.url);
}
//
// class Reader
//
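// A small promise-based random-access reader. The generic-pool instance is
// created with default options, which allow a single resource, so it acts as a
// mutex: reads that would otherwise race on the shared file position are
// serialized.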
class Reader {
constructor ( path ) {
this.path = path;
this.position = 0;
this.file = fs.open( path, 'r' )
this.queue = genericPool.createPool(
{
async create () { return Symbol() },
async destroy ( resource ) { },
},
{}
)
}
async read ( length, position ) {
const token = await this.queue.acquire()
const fd = await this.file
if (typeof position !== 'number')
position = this.position
this.position = position + length
const data = Buffer.alloc(length)
const bytes = await fs.read( fd, data, 0, length, position )
this.queue.release( token )
return data
}
async close () {
await this.queue.drain()
const fd = await this.file
await fs.close( fd )
}
tell () {
return this.position
}
}
var headerLength = 80;
var header = {
magicNumber: 72173914, // integer 0 4 Magic number to recognise the file format, must be 72173914
version: 5, // integer 4 4 ZIM=5, bytes 1-2: major, bytes 3-4: minor version of the ZIM file format
uuid: 0, // integer 8 16 unique id of this zim file
articleCount: 0, // integer 24 4 total number of articles
clusterCount: 0, // integer 28 4 total number of clusters
urlPtrPos: 0, // integer 32 8 position of the directory pointerlist ordered by URL
titlePtrPos: 0, // integer 40 8 position of the directory pointerlist ordered by Title
clusterPtrPos: 0, // integer 48 8 position of the cluster pointer list
mimeListPos: headerLength, // integer 56 8 position of the MIME type list (also header size)
mainPage: 0xffffffff, // integer 64 4 main page or 0xffffffff if no main page
layoutPage: 0xffffffff, // integer 68 4 layout page or 0xffffffff if no layout page
checksumPos: 0, // integer 72 8 pointer to the md5checksum of this file without the checksum itself. This points always 16 bytes before the end of the file.
geoIndexPos: 0, // integer 80 8 pointer to the geo index (optional). Present if mimeListPos is at least 80.
};
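// readHeader() below overwrites only the fields the dumper needs (counts and
// pointer-list positions); uuid, checksumPos and geoIndexPos keep their
// placeholder values and are never consulted.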
async function readHeader ( ) {
log('reading header')
const buf = await src.read( headerLength, 0 )
header.articleCount = buf.readUInt32LE(24);
header.clusterCount = buf.readUInt32LE(28);
header.urlPtrPos = readUInt64LE(buf, 32);
header.titlePtrPos = readUInt64LE(buf, 40);
header.clusterPtrPos = readUInt64LE(buf, 48);
header.mimeListPos = readUInt64LE(buf, 56);
header.mainPage = buf.readUInt32LE(64);
header.layoutPage = buf.readUInt32LE(68);
log('header', header);
}
async function processClusterList ( ) {
log('reading ClusterPointers')
const buf = await src.read( header.clusterCount * 8, header.clusterPtrPos )
try {
for ( let i=0; i < header.clusterCount; i++ ) {
await processCluster( buf, i )
}
} catch ( err ) {
fatal( 'processClusterList', err )
}
};
async function processCluster( buf, clusterIdx ) {
var eof = false;
const clusterOfs = readUInt64LE( buf, clusterIdx * 8 )
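// The first byte of a cluster is its compression field: 0/1 mean uncompressed
// and 4 means LZMA2 (xz), so testing the 4 bit is enough to decide whether the
// rest of the cluster has to go through a Decompressor.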
async function readCompression () {
const buf = await src.read( 1, clusterOfs )
return buf.readUInt8(0) & 4; // xz compressed
}
async function getSource( isCompressed ) {
var slice = fs.createReadStream(
src.path,
{
start: clusterOfs + 1,
// autoClose: false,
}
);
slice.on('error', function (err) {
console.error('processCluster', clusterIdx, 'input error', err);
//~ process.exit(1);
});
slice.on('end', function () {
log('processCluster', clusterIdx, 'input end');
eof = true;
//~ process.exit(1);
});
slice.on('close', function () {
log('processCluster', clusterIdx, 'input closed');
eof = true;
//~ process.exit(1);
});
slice.on('open', function (fd) {
log('processCluster', clusterIdx, 'input open', fd);
});
if ( isCompressed ) { // xz compressed
const decompressed = new lzma.Decompressor()
slice.pipe( decompressed )
return decompressed
}
return slice
}
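// A cluster body starts with a table of 4-byte blob offsets relative to the
// body itself; the first offset points just past the table, so dividing it by
// 4 yields the number of offsets to read.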
async function readOffsets ( input ) {
const offsets = []
let noffsets
for ( var buf; buf = await asyncRead( input, 4 );) {
var ofs = buf.readUInt32LE( 0 )
if ( offsets.length == 0 ) {
noffsets = ofs / 4
}
//~ log('readOffsets', clusterIdx, noffsets, offsets.length, ofs);
offsets.push(ofs)
if ( offsets.length == noffsets ) {
//~ log('readOffsets done', clusterIdx, noffsets, offsets.length, ofs);
return offsets
}
}
fatal( 'readOffsets premature stream end' )
}
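// The table holds n+1 offsets for n blobs: blob i spans
// [offsets[i], offsets[i+1]), and the final offset marks the end of the
// cluster data, hence the offsets.length - 1 iterations below.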
async function dumpBlobs ( input, offsets ) {
for ( let i=0; i < offsets.length-1; i++ ) {
const blobLen = offsets[ i + 1 ] - offsets[ i ]
const blob = blobLen === 0 ?
Buffer.alloc(0)
: await asyncRead( input, blobLen )
await fs.outputFile( blobPath( clusterIdx, i ), blob )
//~ log('readBlobs', clusterIdx, isCompressed, nblobs, i, blobLen)
}
//~ log('readBlobs done', clusterIdx, isCompressed, nblobs, blobIdx, blobLen)
}
let input
try {
const isCompressed = await readCompression()
log('processCluster', clusterIdx, header.clusterCount, isCompressed);
input = await getSource( isCompressed )
const offsets = await readOffsets( input )
await dumpBlobs( input, offsets )
} catch ( err ) {
if (!eof) {
//~ slice.fd = null;
input && input.destroy()
}
fatal( 'processCluster error', clusterIdx, header.clusterCount, err )
}
}
async function getDirEntry ( article ) {
let chunkLen = 512;
let dirEntry
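// A directory entry is laid out as: mimetype (2 bytes), parameter length (1),
// namespace (1), revision (4), then either a redirect index (4) or cluster and
// blob numbers (4 + 4), followed by NUL-terminated url and title strings.
// Entries carry no length field, so the chunk is re-read at double the size
// until both strings fit.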
function parseDirEntry () {
article.mimeIdx = dirEntry.readUInt16LE(0);
article.nameSpace = dirEntry.toString('utf8', 3, 4);
var strOfs = 16;
if (article.mimeIdx == 0xfffe || article.mimeIdx == 0xfffd) {
// linktarget or deleted entry
return true // noop
} else if (article.mimeIdx == 0xffff ) { //redirect
strOfs = 12;
article.redirectIndex = dirEntry.readUInt32LE(8);
} else {
article.clusterIdx = dirEntry.readUInt32LE(8);
article.blobIdx = dirEntry.readUInt32LE(12);
}
// read url and title
var end = dirEntry.indexOf(0, strOfs);
if (end != -1) {
article.url = dirEntry.toString('utf8', strOfs, end);
strOfs = end + 1;
end = dirEntry.indexOf(0, strOfs);
if (end != -1) {
article.title = dirEntry.toString('utf8', strOfs, end);
}
}
if (end == -1) // short buffer -- read more
return false
log('parseDirEntry', article.index, header.articleCount, '\n', article);
articles[article.index] = article
return true
}
try {
while ( true ) {
dirEntry = await src.read( chunkLen, article.offset )
if ( parseDirEntry() )
return article
chunkLen *= 2
}
} catch ( err ) {
fatal( 'getDirEntry read error', article.index, header.articleCount, err )
}
}
async function renameBlob( article ) {
var bpath = blobPath(article.clusterIdx, article.blobIdx)
if (article.nameSpace == 'M') { // metadata
const data = await fs.readFile ( bpath, 'utf8' )
metadata.push([article.url.toLowerCase(), data])
await fs.unlink( bpath )
return false // consumed into the metadata table, nothing to rename
}
const apath = articlePath( article )
log('renameBlob', article.index, header.articleCount, bpath, '->', apath )
await fs.move( bpath, apath, { clobber: true })
return true
}
async function loadArticle( article ) {
if (article.nameSpace != 'A')
return null
const data = await fs.readFile( articlePath( article ))
try {
const dom = cheerio.load( data )
return dom
} catch ( e ) {
log( 'cheerio.load error', e, data )
return null
}
}
var nameSpaces = ['-', 'A', 'B', 'I', 'J', 'M', 'U', 'W', 'X'];
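// Links inside a ZIM article address other entries through their namespace
// directory, e.g. "../A/Some_Page" or "/I/logo.png". Since the dump writes
// everything flat under outPath, alterLinks() strips the leading ".." (or "")
// segment plus the namespace letter and keeps the remainder as a relative
// link.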
function alterLinks( article, dom ) {
var nameSpaceLink = function (elem, attr) {
let link
try {
link = url.parse(elem.attribs[attr], true, true)
} catch (err) {
//~ console.error('alterLinks error', err, article, attr, elem.attribs[attr], elem)
console.error('alterLinks', err.message, elem.attribs[attr], 'at', article.url)
return
}
if ( (link.protocol && link.protocol != 'http:' && link.protocol != 'https:')
|| link.host || ! link.pathname)
return
var chunks = link.pathname.split('/')
if (( chunks[0] == '' // abs path
|| chunks[0] == '..' )
&& nameSpaces.indexOf(chunks[1]) != -1) {
chunks.shift();
chunks.shift();
link.pathname = chunks.join('/');
//~ log('alterLinks', elem.attribs[attr], url.format(link));
elem.attribs[attr] = url.format(link);
return // OK
}
return
}
dom( '[src]' ).each( (i, elem) => nameSpaceLink( elem, 'src' ))
dom( '[href]' ).each( (i, elem) => nameSpaceLink( elem, 'href' ))
}
async function processArticle ( articleIndex ) {
if ( articles[ articleIndex ] != null )
return true // already processed
const article = {
index: articleIndex,
offset: readUInt64LE( rawDirectory, articleIndex * 8 )
}
await getDirEntry( article )
if ( article.mimeIdx == 0xfffe || article.mimeIdx == 0xfffd ) {
// linktarget or deleted entry
return true // noop
}
if ( article.mimeIdx == 0xffff ) { //redirect
return storeRedirect( article )
}
const moved = await renameBlob( article )
if (! moved )
return null
const dom = await loadArticle( article )
if (! dom )
return null
await alterLinks( article, dom )
return fs.outputFile( articlePath( article ), Buffer.from( dom.html() ))
}
var rawDirectory
async function processArticleList () {
log('reading ArticleList')
articles = Array( header.articleCount )
rawDirectory = await src.read(header.articleCount * 8, header.urlPtrPos )
//~ log( 'articleOffsets', articleOffsets);
for ( let i=0; i < header.articleCount; i++ ) {
await processArticle( i )
}
log( '*** articles' )
articles.forEach( (val, i ) => log( i, val.nameSpace, val.url ))
if ( redirectOut )
return new Promise( ( resolve, reject ) => {
redirectOut.end( resolve )
})
}
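// The title pointer list is a permutation of the article directory sorted by
// title; processTitleList() only logs it for inspection, nothing more is
// written to disk.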
async function processTitleList () {
log('reading Title List')
const titleDirectory = await src.read( header.articleCount * 4, header.titlePtrPos )
//~ log( 'articleOffsets', articleOffsets);
log( '*** titles' )
for ( let i=0; i < header.articleCount; i++ ) {
const idx = titleDirectory.readUInt32LE( i * 4 )
const article = articles[ idx ]
if ( article ) // deleted entries and link targets leave holes in articles[]
log( i, idx, article.nameSpace, article.title, '>', article.url )
}
}
var redirectOut = null
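// Redirect rows go to redirects.csv one level above outPath. Writes respect
// stream backpressure: when write() returns false the row is retried on
// 'drain' instead of queueing unboundedly in memory.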
function storeRedirect ( article ) {
log('storeRedirect', article)
if (article.nameSpace == '-' && (article.url == 'favicon' || article.url == 'mainPage'))
return
if (! redirectOut) {
redirectOut = csvOutput({delimiter: '\t'})
redirectOut.pipe(fs.createWriteStream(osPath.join(outPath, '..', 'redirects.csv')))
}
var target = articles[ article.redirectIndex ]
if (! target) { // the redirect target article hasn't been processed yet
return processArticle( article.redirectIndex )
.then(() => storeRedirect( article ))
}
var item = [ article.nameSpace, article.url, article.title, target.url ]
log('storeRedirect', item)
return new Promise(( resolve, reject ) => {
var write = function () {
try {
if (! redirectOut.write(item))
return redirectOut.once('drain', write)
resolve( false )
} catch ( err ) {
reject( err )
}
}
write()
})
}
function storeMetadata () {
log('storeMetadata');
if ( metadata.length == 0 )
return
var csv = csvOutput({ delimiter: ' ' })
csv.pipe( fs.createWriteStream( osPath.join( outPath, '..', 'metadata.csv' )))
return new Promise(( resolve, reject ) => {
var i = 0;
var write = function () {
try {
while (true) {
if ( i == metadata.length ) {
log('storeMetadata finished');
return csv.end( resolve );
}
var item = metadata[i];
log('storeMetadata', metadata.length, i, item);
if (! csv.write( item ))
break;
i++
}
csv.once( 'drain', write )
} catch ( err ) {
reject( err )
}
}
write()
})
}
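// Dump pipeline: read the header, extract every cluster's blobs to temporary
// names, walk the article directory (renaming blobs, rewriting links,
// collecting redirects and metadata), log the title list, then flush
// metadata.csv.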
async function core () {
src = new Reader(srcPath)
await readHeader( )
await processClusterList()
await processArticleList()
await processTitleList()
await storeMetadata()
await src.close()
}
function main () {
command
.version( packageInfo.version )
.arguments( '<zim-file> [output-dir]' )
.description( 'Dumps a ZIM file' )
.parse( process.argv )
log( command.opts() )
srcPath = expandHomeDir( command.args[0] )
outPath = expandHomeDir( command.args[1] )
if (! outPath ) {
var parsed = osPath.parse(srcPath)
outPath = parsed.name
}
core()
}
main ()
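// Typical invocation (hypothetical file names):
//   ./unzimmer.js wikipedia_en_all.zim dump-dir
// With output-dir omitted, the ZIM file's base name is used; redirects.csv and
// metadata.csv are written one level above the output directory.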