unzimmer.js
parent 08a1b6ca5c
commit 4b842a8b41
package.json (12 lines changed)

@@ -19,6 +19,7 @@
     "child-process": "*",
     "commander": "^2.11.0",
     "csv-parse": "*",
+    "csv-stringify": "^4.3.1",
     "encodeurl": "^1.0.1",
     "expand-home-dir": "*",
     "fs-extra": "^3.0.1",
@@ -33,17 +34,18 @@
     "moment-duration-format": "^2.2.2",
     "mozjpeg": "*",
     "mz": "^2.6.0",
     "promised-read": "^2.0.1",
     "quick-lru": "^1.0.0",
     "request": "^2.81.0",
-    "request-promise": "^4.2.1",
+    "request-promise": "*",
     "sanitize-filename": "^1.6.1",
     "sharp": "^0.17.3",
     "sqlite": "^2.8.0",
     "sqlite3": "*",
-    "uuid": "*"
+    "uuid": "*",
+    "xz": "^1.3.0"
   },
-  "engines" : {
-    "node" : ">=8.0.0"
+  "engines": {
+    "node": ">=8.0.0"
   },
   "bin": {
     "zimmer": "./zimmer.js",

unzimmer.js (new executable file, 550 lines)

@@ -0,0 +1,550 @@
#!/bin/sh
":" //# -*- mode: js -*-; exec /usr/bin/env node --max-old-space-size=9000 --stack-size=42000 "$0" "$@"
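// sh/node polyglot launcher: run as a shell script, the ":" no-op line above
// immediately re-execs this file under node with enlarged heap and stack
// limits; node instead parses that same line as a string expression followed
// by a comment, so the script body below runs unchanged.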
"use strict";

/************************************/
/* MODULE VARIABLE SECTION **********/
/************************************/

const fs = require( 'fs-extra' )
var mimeDb = require( 'mime-db' );
var mime = require( 'mime-types' );

const packageInfo = require('./package.json');
const genericPool = require( 'generic-pool' )
const asyncRead = require('promised-read').read
const cheerio = require('cheerio')
const command = require('commander')
const url = require('url') // URL parsing for alterLinks below

const osProcess = require('process')
var osPath = require( 'path' );
var expandHomeDir = require( 'expand-home-dir' );
//~ var lzma = require('lzma-native');
var lzma = require('xz');
//~ var lzma = require('node-liblzma');
var csvOutput = require('csv-stringify');

const moment = require("moment")
require("moment-duration-format")

var srcPath;
var outPath;
var src; // input file reader

var articles = null;
var metadata = [];

const startTime = Date.now()

function elapsedStr( from, to = Date.now()) {
    return moment.duration( to - from ).format('d[d]hh:mm:ss.SSS', { stopTrim: "h" })
}

function log ( ...args ) {
    console.log( elapsedStr( startTime ), ...args )
}

function warning ( ...args ) {
    log( ...args )
}

function fatal ( ...args ) {
    log( ...args )
    osProcess.exit( 1 )
}

// Node 8 has no 64-bit buffer reads, so combine two 32-bit halves. The result
// stays exact below 2^53, which covers any realistic ZIM file offset.
function readUInt64LE(buf, offset) {
    var lowBits = buf.readUInt32LE(offset);
    var highBits = buf.readUInt32LE(offset + 4);
    return highBits * 0x100000000 + lowBits
}

function blobPath(clusterIdx, blobIdx) {
    return osPath.join(outPath, clusterIdx + '-' + blobIdx + '-blob');
}

function articlePath(article) {
    return osPath.join(outPath, article.url);
}

//
// class Reader
//
class Reader {
    constructor ( path ) {
        this.path = path;
        this.position = 0;
        this.file = fs.open( path, 'r' )

        // generic-pool with default options (max: 1) acts as a mutex, so
        // concurrent read() calls never interleave on the shared fd
        this.queue = genericPool.createPool(
            {
                async create () { return Symbol() },
                async destroy ( resource ) { },
            },
            {}
        )
    }

    async read ( length, position ) {

        const token = await this.queue.acquire()
        const fd = await this.file

        if (typeof position !== 'number')
            position = this.position
        this.position = position + length

        const data = Buffer.alloc(length)
        const bytes = await fs.read( fd, data, 0, length, position )
        this.queue.release( token )
        return data
    }

    async close () {
        await this.queue.drain()
        const fd = await this.file
        await fs.close( fd )
    }

    tell () {
        return this.position
    }
}
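
// Usage sketch (file name illustrative) -- reads may be issued concurrently
// and are served one at a time, each at an absolute offset:
//
//   const r = new Reader( 'dump.zim' )
//   const buf = await r.read( 4, 0 )   // bytes 0-3 hold the magic number
//   await r.close()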

var headerLength = 80;

var header = {
    magicNumber: 72173914,     // integer  0  4  magic number to recognise the file format, must be 72173914
    version: 5,                // integer  4  4  ZIM=5, bytes 1-2: major, bytes 3-4: minor version of the ZIM file format
    uuid: 0,                   // integer  8 16  unique id of this zim file
    articleCount: 0,           // integer 24  4  total number of articles
    clusterCount: 0,           // integer 28  4  total number of clusters
    urlPtrPos: 0,              // integer 32  8  position of the directory pointer list ordered by URL
    titlePtrPos: 0,            // integer 40  8  position of the directory pointer list ordered by title
    clusterPtrPos: 0,          // integer 48  8  position of the cluster pointer list
    mimeListPos: headerLength, // integer 56  8  position of the MIME type list (also header size)
    mainPage: 0xffffffff,      // integer 64  4  main page or 0xffffffff if no main page
    layoutPage: 0xffffffff,    // integer 68  4  layout page or 0xffffffff if no layout page
    checksumPos: 0,            // integer 72  8  pointer to the md5 checksum of this file, which always sits 16 bytes before the end of the file; the checksum itself is excluded
    geoIndexPos: 0,            // integer 80  8  pointer to the geo index (optional); present if mimeListPos is at least 80
};

async function readHeader ( ) {
    log('reading header')
    const buf = await src.read( headerLength, 0 )

    header.articleCount = buf.readUInt32LE(24);
    header.clusterCount = buf.readUInt32LE(28);

    header.urlPtrPos = readUInt64LE(buf, 32);
    header.titlePtrPos = readUInt64LE(buf, 40);
    header.clusterPtrPos = readUInt64LE(buf, 48);
    header.mimeListPos = readUInt64LE(buf, 56);

    header.mainPage = buf.readUInt32LE(64);
    header.layoutPage = buf.readUInt32LE(68);

    log('header', header);
}

async function processClusterList ( ) {
    log('reading ClusterPointers')
    const buf = await src.read( header.clusterCount * 8, header.clusterPtrPos )

    try {
        for ( let i = 0; i < header.clusterCount; i++ ) {
            await processCluster( buf, i )
        }
    } catch ( err ) {
        fatal( 'processClusterList', err )
    }
}

async function processCluster( buf, clusterIdx ) {
    var eof = false;

    const clusterOfs = readUInt64LE( buf, clusterIdx * 8 )

    async function readCompression () {
        const buf = await src.read( 1, clusterOfs )

        return buf.readUInt8(0) & 4; // 4 -> xz compressed
    }

    async function getSource( isCompressed ) {
        var slice = fs.createReadStream(
            src.path,
            {
                start: clusterOfs + 1, // skip the compression byte
                // autoClose: false,
            }
        );

        slice.on('error', function (err) {
            console.error('processCluster', clusterIdx, 'input error', err);
            //~ process.exit(1);
        });

        slice.on('end', function () {
            log('processCluster', clusterIdx, 'input end');
            eof = true;
            //~ process.exit(1);
        });

        slice.on('close', function () {
            log('processCluster', clusterIdx, 'input closed');
            eof = true;
            //~ process.exit(1);
        });

        slice.on('open', function (fd) {
            log('processCluster', clusterIdx, 'input open', fd);
        });

        if ( isCompressed ) { // xz compressed
            const decompressed = new lzma.Decompressor()
            slice.pipe( decompressed )
            return decompressed
        }
        return slice
    }

    async function readOffsets ( input ) {
        const offsets = []
        let noffsets
        for ( var buf; buf = await asyncRead( input, 4 ); ) {
            var ofs = buf.readUInt32LE( 0 )
            if ( offsets.length == 0 ) {
                // the first offset points just past the offset list itself,
                // so it also determines how many offsets there are
                noffsets = ofs / 4
            }
            //~ log('readOffsets', clusterIdx, noffsets, offsets.length, ofs);
            offsets.push(ofs)

            if ( offsets.length == noffsets ) {
                //~ log('readOffsets done', clusterIdx, noffsets, offsets.length, ofs);
                return offsets
            }
        }
        fatal( 'readOffsets premature stream end' )
    }

    async function dumpBlobs ( input, offsets ) {
        for ( let i = 0; i < offsets.length - 1; i++ ) {

            const blobLen = offsets[ i + 1 ] - offsets[ i ]
            const blob = blobLen === 0 ?
                Buffer.alloc(0)
                : await asyncRead( input, blobLen )
            await fs.outputFile( blobPath( clusterIdx, i ), blob )

            //~ log('readBlobs', clusterIdx, isCompressed, nblobs, i, blobLen)
        }

        //~ log('readBlobs done', clusterIdx, isCompressed, nblobs, blobIdx, blobLen)
    }

    let input

    try {
        const isCompressed = await readCompression()
        log('processCluster', clusterIdx, header.clusterCount, isCompressed);

        input = await getSource( isCompressed )
        const offsets = await readOffsets( input )
        await dumpBlobs( input, offsets )
    } catch ( err ) {
        if (!eof) {
            //~ slice.fd = null;
            input && input.destroy()
        }
        fatal( 'processCluster error', clusterIdx, header.clusterCount, err )
    }
}
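
// Cluster layout recap: one compression byte, then a list of little-endian
// u32 blob offsets (the first offset also fixes the list length, offset / 4),
// then the blob bodies; blob i occupies offsets[i]..offsets[i+1].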

async function getDirEntry ( article ) {
    let chunkLen = 512;
    let dirEntry

    function parseDirEntry () {
        article.mimeIdx = dirEntry.readUInt16LE(0);
        article.nameSpace = dirEntry.toString('utf8', 3, 4);

        var strOfs = 16;
        if (article.mimeIdx == 0xfffe || article.mimeIdx == 0xfffd) {
            // linktarget or deleted entry
            return true // noop
        } else if (article.mimeIdx == 0xffff ) { // redirect
            strOfs = 12;
            article.redirectIndex = dirEntry.readUInt32LE(8);
        } else {
            article.clusterIdx = dirEntry.readUInt32LE(8);
            article.blobIdx = dirEntry.readUInt32LE(12);
        }

        // read url and title
        var end = dirEntry.indexOf(0, strOfs);
        if (end != -1) {
            article.url = dirEntry.toString('utf8', strOfs, end);

            strOfs = end + 1;
            end = dirEntry.indexOf(0, strOfs);
            if (end != -1) {
                article.title = dirEntry.toString('utf8', strOfs, end);
            }
        }

        if (end == -1) // short buffer -- read more
            return false

        log('parseDirEntry', article.index, header.articleCount, '\n', article);

        articles[article.index] = article

        return true
    }

    try {
        while ( true ) {
            dirEntry = await src.read( chunkLen, article.offset )
            if ( parseDirEntry() )
                return article
            chunkLen *= 2 // entry didn't fit -- retry with a larger chunk
        }
    } catch ( err ) {
        fatal( 'getDirEntry read error', article.index, header.articleCount, err )
    }
}
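
// Directory entry layout, per the ZIM spec: u16 mimetype, u8 parameter
// length, one namespace character, u32 revision, then either a u32 redirect
// index (mimetype 0xffff) or u32 cluster + u32 blob numbers, followed by
// NUL-terminated url and title strings.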

async function renameBlob( article ) {

    var bpath = blobPath(article.clusterIdx, article.blobIdx)

    if (article.nameSpace == 'M') { // metadata -- recorded, not kept as a file
        const data = await fs.readFile ( bpath, 'utf8' )
        metadata.push([article.url.toLowerCase(), data])
        await fs.unlink( bpath )
        return false
    }
    const apath = articlePath( article )

    log('renameBlob', article.index, header.articleCount, bpath, '->', apath )

    await fs.move( bpath, apath, { clobber: true })
    return true
}

async function loadArticle( article ) {
    if (article.nameSpace != 'A')
        return null
    const data = await fs.readFile( articlePath( article ))

    try {
        const dom = cheerio.load( data )
        return dom
    } catch ( e ) {
        log( 'cheerio.load error', e, data )
        return null
    }
}

var nameSpaces = ['-', 'A', 'B', 'I', 'J', 'M', 'U', 'W', 'X'];

function alterLinks( article, dom ) {
    var nameSpaceLink = function (elem, attr) {
        let link
        try {
            link = url.parse(elem.attribs[attr], true, true)
        } catch (err) {
            //~ console.error('alterLinks error', err, article, attr, elem.attribs[attr], elem)
            console.error('alterLinks', err.message, elem.attribs[attr], 'at', article.url)
            return
        }
        if ( (link.protocol && link.protocol != 'http:' && link.protocol != 'https:')
                || link.host || ! link.pathname)
            return

        var chunks = link.pathname.split('/')

        if ( chunks[0] == '' // abs path
                || ( chunks[0] == '..'
                    && nameSpaces.indexOf(chunks[1]) != -1 )) {
            chunks.shift();
            chunks.shift();
            link.pathname = chunks.join('/');
            //~ log('alterLinks', elem.attribs[attr], url.format(link));
            elem.attribs[attr] = url.format(link);
            return // OK
        }
        return
    }

    dom( '[src]' ).each( (i, elem) => nameSpaceLink( elem, 'src' ))
    dom( '[href]' ).each( (i, elem) => nameSpaceLink( elem, 'href' ))
}
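
// Example of the rewrite (link names illustrative): inside an extracted
// article, "../A/Foo.html" or "/A/Foo.html" becomes the flat sibling path
// "Foo.html", while external http(s) links and bare fragments pass through
// untouched.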

async function processArticle ( articleIndex ) {
    if ( articles[ articleIndex ] != null )
        return true // already processed

    const article = {
        index: articleIndex,
        offset: readUInt64LE( rawDirectory, articleIndex * 8 )
    }

    await getDirEntry( article )

    if ( article.mimeIdx == 0xfffe || article.mimeIdx == 0xfffd ) {
        // linktarget or deleted entry
        return true // noop
    }
    if ( article.mimeIdx == 0xffff ) { // redirect
        return storeRedirect( article )
    }

    const moved = await renameBlob( article )
    if (! moved )
        return null
    const dom = await loadArticle( article )
    if (! dom )
        return null
    alterLinks( article, dom )
    return fs.outputFile( articlePath( article ), Buffer.from( dom.html() ))
}

var rawDirectory

async function processArticleList () {
    log('reading ArticleList')
    articles = Array( header.articleCount )
    rawDirectory = await src.read( header.articleCount * 8, header.urlPtrPos )

    //~ log( 'articleOffsets', articleOffsets);

    for ( let i = 0; i < header.articleCount; i++ ) {
        await processArticle( i )
    }
    log( '*** articles' )
    articles.forEach( (val, i ) => log( i, val.nameSpace, val.url ))

    if ( redirectOut )
        return new Promise( ( resolve, reject ) => {
            redirectOut.end( resolve )
        })
}

async function processTitleList () {
    log('reading Title List')
    // the title pointer list holds u32 indices into the url-ordered directory
    const titleDirectory = await src.read( header.articleCount * 4, header.titlePtrPos )

    //~ log( 'articleOffsets', articleOffsets);
    log( '*** titles' )

    for ( let i = 0; i < header.articleCount; i++ ) {
        const idx = titleDirectory.readUInt32LE( i * 4 )
        log( i, idx, articles[ idx ].nameSpace, articles[ idx ].title, '>', articles[ idx ].url )
    }
}

var redirectOut = null

function storeRedirect ( article ) {
    log('storeRedirect', article)

    if (article.nameSpace == '-' && (article.url == 'favicon' || article.url == 'mainPage'))
        return

    if (! redirectOut) {
        redirectOut = csvOutput({ delimiter: '\t' })
        redirectOut.pipe(fs.createWriteStream(osPath.join(outPath, '..', 'redirects.csv')))
    }

    var target = articles[ article.redirectIndex ]
    if (! target) { // the target article isn't processed yet -- recurse, then retry
        return processArticle( article.redirectIndex )
        .then(() => storeRedirect( article ))
    }

    var item = [ article.nameSpace, article.url, article.title, target.url ]

    log('storeRedirect', item)

    return new Promise(( resolve, reject ) => {
        var write = function () {
            try {
                if (! redirectOut.write(item))
                    return redirectOut.once('drain', write)
                resolve( false )
            } catch ( err ) {
                reject( err )
            }
        }
        write()
    })
}
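
// redirects.csv rows are tab-separated: namespace, source url, title, target
// url. Illustrative row:  A<TAB>Foo<TAB>Foo<TAB>Bar.html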

function storeMetadata () {
    log('storeMetadata');
    if ( metadata.length == 0 )
        return

    var csv = csvOutput({ delimiter: ' ' })
    csv.pipe( fs.createWriteStream( osPath.join( outPath, '..', 'metadata.csv' )))

    return new Promise(( resolve, reject ) => {
        var i = 0;
        // drain the metadata array into the csv stream, pausing on
        // backpressure and resuming on 'drain'
        var write = function () {
            try {
                while (true) {
                    if ( i == metadata.length ) {
                        log('storeMetadata finished');
                        return csv.end( resolve );
                    }
                    var item = metadata[i];
                    log('storeMetadata', metadata.length, i, item);
                    if (! csv.write( item ))
                        break;
                    i++
                }
                csv.once( 'drain', write )
            } catch ( err ) {
                reject( err )
            }
        }
        write()
    })
}

// Processing pipeline: read the header, dump every cluster's blobs to disk,
// then walk the url-ordered directory renaming blobs into articles (rewriting
// links and recording redirects on the way), list the titles, and finally
// store the collected metadata.
async function core () {
    src = new Reader(srcPath)

    await readHeader( )
    await processClusterList()
    await processArticleList()
    await processTitleList()
    await storeMetadata()

    await src.close()
}

function main () {
    command
        .version( packageInfo.version )
        .arguments( '<zim-file> [output-dir]' )
        .description( 'Dumps a ZIM file' )
        .option( '-h, --help' )
        .parse( process.argv )

    log( command.opts() )

    srcPath = expandHomeDir( command.args[0] )
    outPath = expandHomeDir( command.args[1] )
    if (! outPath ) {
        var parsed = osPath.parse(srcPath)
        outPath = parsed.name // default: the ZIM file's base name
    }

    core()
}

main()
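
// Typical invocation (file name illustrative); when the output directory
// argument is omitted it defaults to the ZIM file's base name:
//
//   ./unzimmer.js wikipedia_en_all_2017-08.zim [output-dir]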