#!/bin/sh
":" //# -*- mode: js -*-; exec /usr/bin/env node --max-old-space-size=9000 --stack-size=42000 "$0" "$@"
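
// The two lines above form an sh/node polyglot: /bin/sh runs the ":"
// no-op builtin (its arguments, including the "//#..." word, are
// ignored), then execs node on this same file with extra V8 flags;
// node instead parses ":" as a bare string expression and treats
// everything after "//" as a comment.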
"use strict";

/************************************/
/* MODULE VARIABLE SECTION **********/
/************************************/

const os = require( 'os' )
const osProcess = require( 'process' )
const osPath = require( 'path' )
const url = require( 'url' )

const expandHomeDir = require( 'expand-home-dir' )
const fs = require( 'fs-extra' )
const mimeDb = require( 'mime-db' )
const mime = require( 'mime-types' )

const packageInfo = require( './package.json' )
const genericPool = require( 'generic-pool' )
const asyncRead = require( 'promised-read' ).read
const cheerio = require( 'cheerio' )
const command = require( 'commander' )

const csvOutput = require( 'csv-stringify' )

const moment = require( 'moment' )
require( 'moment-duration-format' )

const startTime = Date.now()

function elapsedStr ( from, to = Date.now() ) {
    return moment.duration( to - from ).format( 'd[d]hh:mm:ss.SSS', { stopTrim: 'h' })
}

function log ( ...args ) {
    console.log( elapsedStr( startTime ), ...args )
}

function warning ( ...args ) {
    log( ...args )
}

function fatal ( ...args ) {
    log( ...args )
    osProcess.exit( 1 )
}

//~ var lzma = require('lzma-native')
try {
    var lzma = require( 'xz' )
} catch ( er ) {
    if ( os.type() == 'Windows_NT' ) {
        fatal( 'Module "xz" is not available on Windows' )
    } else {
        fatal( 'Module "xz" is required' )
    }
}
//~ var lzma = require('node-liblzma')

var srcPath;            // source ZIM file path
var outPath;            // output directory
var src;                // input file reader

var articles = null;    // directory entries, indexed in URL-pointer order
var metadata = [];      // [name, value] pairs collected from the M namespace
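
// ZIM stores file offsets as unsigned 64-bit little-endian integers,
// which Buffer of this era has no direct reader for; combining the two
// 32-bit halves below is exact while the value stays under
// Number.MAX_SAFE_INTEGER (2^53 - 1), far beyond any real ZIM size.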
function readUInt64LE ( buf, offset ) {
    var lowBits = buf.readUInt32LE( offset );
    var highBits = buf.readUInt32LE( offset + 4 );
    return highBits * 0x100000000 + lowBits
}

// temporary dump file name for blob (clusterIdx, blobIdx)
function blobPath ( clusterIdx, blobIdx ) {
    return osPath.join( outPath, clusterIdx + '-' + blobIdx + '-blob' );
}

// final location of an article: its ZIM URL under outPath
function articlePath ( article ) {
    return osPath.join( outPath, article.url );
}

//
// class Reader
//
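
// Reader wraps a file descriptor with a generic-pool "queue" used as a
// mutex: with default options the pool holds a single token, so each
// read() must acquire it before touching the shared position, which
// serializes otherwise interleaved reads from the async callers below.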
class Reader {
    constructor ( path ) {
        this.path = path;
        this.position = 0;
        this.file = fs.open( path, 'r' )

        this.queue = genericPool.createPool(
            {
                async create () { return Symbol() },
                async destroy ( resource ) { },
            },
            {}
        )
    }

    async read ( length, position ) {

        const token = await this.queue.acquire()
        const fd = await this.file

        if ( typeof position !== 'number' )
            position = this.position
        this.position = position + length

        const data = Buffer.alloc( length )
        const bytes = await fs.read( fd, data, 0, length, position )
        this.queue.release( token )
        return data
    }

    async close () {
        await this.queue.drain()
        const fd = await this.file
        await fs.close( fd )
    }

    tell () {
        return this.position
    }
}

var headerLength = 80;

var header = {
    magicNumber: 72173914,      // integer  0  4  magic number to recognise the file format, must be 72173914
    version: 5,                 // integer  4  4  ZIM=5, bytes 1-2: major, bytes 3-4: minor version of the ZIM file format
    uuid: 0,                    // integer  8 16  unique id of this zim file
    articleCount: 0,            // integer 24  4  total number of articles
    clusterCount: 0,            // integer 28  4  total number of clusters
    urlPtrPos: 0,               // integer 32  8  position of the directory pointer list ordered by URL
    titlePtrPos: 0,             // integer 40  8  position of the directory pointer list ordered by title
    clusterPtrPos: 0,           // integer 48  8  position of the cluster pointer list
    mimeListPos: headerLength,  // integer 56  8  position of the MIME type list (also header size)
    mainPage: 0xffffffff,       // integer 64  4  main page or 0xffffffff if no main page
    layoutPage: 0xffffffff,     // integer 68  4  layout page or 0xffffffff if no layout page
    checksumPos: 0,             // integer 72  8  pointer to the md5 checksum of this file, excluding the checksum itself; always 16 bytes before the end of the file
    geoIndexPos: 0,             // integer 80  8  pointer to the geo index (optional); present if mimeListPos is at least 80
};

// Read the fixed 80-byte header and decode the fields we use.
async function readHeader ( ) {
    log( 'reading header' )
    const buf = await src.read( headerLength, 0 )

    header.articleCount = buf.readUInt32LE( 24 );
    header.clusterCount = buf.readUInt32LE( 28 );

    header.urlPtrPos = readUInt64LE( buf, 32 );
    header.titlePtrPos = readUInt64LE( buf, 40 );
    header.clusterPtrPos = readUInt64LE( buf, 48 );
    header.mimeListPos = readUInt64LE( buf, 56 );

    header.mainPage = buf.readUInt32LE( 64 );
    header.layoutPage = buf.readUInt32LE( 68 );

    log( 'header', header );
}

// Walk the cluster pointer list and dump every cluster's blobs.
async function processClusterList ( ) {
    log( 'reading ClusterPointers' )
    const buf = await src.read( header.clusterCount * 8, header.clusterPtrPos )

    try {
        for ( let i = 0; i < header.clusterCount; i++ ) {
            await processCluster( buf, i )
        }
    } catch ( err ) {
        fatal( 'processClusterList', err )
    }
}
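
// Cluster layout, per the ZIM spec: byte 0 is a compression flag
// (4 = LZMA2/xz, the bit readCompression() tests), followed by the
// (possibly compressed) cluster body: a list of 4-byte little-endian
// blob offsets relative to the start of that list, then the blob data.
// The first offset divided by 4 therefore gives the number of offsets,
// and blob i occupies the range [offsets[i], offsets[i+1]).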
async function processCluster ( buf, clusterIdx ) {
    var eof = false;

    const clusterOfs = readUInt64LE( buf, clusterIdx * 8 )

    async function readCompression () {
        const buf = await src.read( 1, clusterOfs )

        return buf.readUInt8( 0 ) & 4; // xz compressed
    }

    // Stream the cluster body, transparently decompressing when needed.
    async function getSource ( isCompressed ) {
        var slice = fs.createReadStream(
            src.path,
            {
                start: clusterOfs + 1,
                // autoClose: false,
            }
        );

        slice.on( 'error', function ( err ) {
            console.error( 'processCluster', clusterIdx, 'input error', err );
            //~ process.exit(1);
        });

        slice.on( 'end', function () {
            log( 'processCluster', clusterIdx, 'input end' );
            eof = true;
            //~ process.exit(1);
        });

        slice.on( 'close', function () {
            log( 'processCluster', clusterIdx, 'input closed' );
            eof = true;
            //~ process.exit(1);
        });

        slice.on( 'open', function ( fd ) {
            log( 'processCluster', clusterIdx, 'input open', fd );
        });

        if ( isCompressed ) { // xz compressed
            const decompressed = new lzma.Decompressor()
            slice.pipe( decompressed )
            return decompressed
        }
        return slice
    }

    // Read the blob offset list; the first offset / 4 gives its length.
    async function readOffsets ( input ) {
        const offsets = []
        let noffsets
        for ( var buf; buf = await asyncRead( input, 4 ); ) {
            var ofs = buf.readUInt32LE( 0 )
            if ( offsets.length == 0 ) {
                noffsets = ofs / 4
            }
            //~ log('readOffsets', clusterIdx, noffsets, offsets.length, ofs);
            offsets.push( ofs )

            if ( offsets.length == noffsets ) {
                //~ log('readOffsets done', clusterIdx, noffsets, offsets.length, ofs);
                return offsets
            }
        }
        fatal( 'readOffsets premature stream end' )
    }

    // Write each blob to a temporary file named after its cluster and index.
    async function dumpBlobs ( input, offsets ) {
        for ( let i = 0; i < offsets.length - 1; i++ ) {

            const blobLen = offsets[ i + 1 ] - offsets[ i ]
            const blob = blobLen === 0 ?
                Buffer.alloc( 0 )
                : await asyncRead( input, blobLen )
            await fs.outputFile( blobPath( clusterIdx, i ), blob )

            //~ log('readBlobs', clusterIdx, isCompressed, nblobs, i, blobLen)
        }

        //~ log('readBlobs done', clusterIdx, isCompressed, nblobs, blobIdx, blobLen)
    }

    let input

    try {
        const isCompressed = await readCompression()
        log( 'processCluster', clusterIdx, header.clusterCount, isCompressed );

        input = await getSource( isCompressed )
        const offsets = await readOffsets( input )
        await dumpBlobs( input, offsets )
    } catch ( err ) {
        if ( ! eof ) {
            //~ slice.fd = null;
            input && input.destroy()
        }
        fatal( 'processCluster error', clusterIdx, header.clusterCount, err )
    }
}
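
// Directory entry layout, per the ZIM spec: a 2-byte MIME type index
// (0xffff = redirect, 0xfffe = linktarget, 0xfffd = deleted entry),
// a parameter length byte, a 1-byte namespace, a 4-byte revision, then
// either a 4-byte redirect target index (redirects) or 4-byte cluster
// and blob numbers (content), followed by the zero-terminated url and
// title. Entries are variable length, so getDirEntry reads a chunk and
// doubles it until both terminating zeros are found.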

async function getDirEntry ( article ) {
    let chunkLen = 512;
    let dirEntry

    // Returns true when the entry was fully parsed, false when the
    // buffer was too short and a larger read is needed.
    function parseDirEntry () {
        article.mimeIdx = dirEntry.readUInt16LE( 0 );
        article.nameSpace = dirEntry.toString( 'utf8', 3, 4 );

        var strOfs = 16;
        if ( article.mimeIdx == 0xfffe || article.mimeIdx == 0xfffd ) {
            // linktarget or deleted entry
            return true // noop
        } else if ( article.mimeIdx == 0xffff ) { // redirect
            strOfs = 12;
            article.redirectIndex = dirEntry.readUInt32LE( 8 );
        } else {
            article.clusterIdx = dirEntry.readUInt32LE( 8 );
            article.blobIdx = dirEntry.readUInt32LE( 12 );
        }

        // read url and title
        var end = dirEntry.indexOf( 0, strOfs );
        if ( end != -1 ) {
            article.url = dirEntry.toString( 'utf8', strOfs, end );

            strOfs = end + 1;
            end = dirEntry.indexOf( 0, strOfs );
            if ( end != -1 ) {
                article.title = dirEntry.toString( 'utf8', strOfs, end );
            }
        }

        if ( end == -1 ) // short buffer -- read more
            return false

        log( 'parseDirEntry', article.index, header.articleCount, '\n', article );

        articles[ article.index ] = article

        return true
    }

    try {
        while ( true ) {
            dirEntry = await src.read( chunkLen, article.offset )
            if ( parseDirEntry() )
                return article
            chunkLen *= 2
        }
    } catch ( err ) {
        fatal( 'getDirEntry read error', article.index, header.articleCount, err )
    }
}

// Metadata blobs are collected for metadata.csv; everything else is
// moved from its temporary blob name to its article URL. Returns true
// when a blob was moved into place, false for metadata entries.
async function renameBlob ( article ) {

    var bpath = blobPath( article.clusterIdx, article.blobIdx )

    if ( article.nameSpace == 'M' ) { // metadata
        const data = await fs.readFile( bpath, 'utf8' )
        metadata.push([ article.url.toLowerCase(), data ])
        await fs.unlink( bpath )
        return false // nothing to post-process
    }
    const apath = articlePath( article )

    log( 'renameBlob', article.index, header.articleCount, bpath, '->', apath )

    return fs.move( bpath, apath, { clobber: true }).then( () => true )
}

// Parse an 'A' namespace article into a cheerio DOM, or return null.
async function loadArticle ( article ) {
    if ( article.nameSpace != 'A' )
        return null
    const data = await fs.readFile( articlePath( article ))

    try {
        const dom = cheerio.load( data )
        return dom
    } catch ( e ) {
        log( 'cheerio.load error', e, data )
        return null
    }
}

var nameSpaces = [ '-', 'A', 'B', 'I', 'J', 'M', 'U', 'W', 'X' ];
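
// Inside a ZIM file, article links carry a namespace prefix, e.g.
// "../A/Some_Page" or "/A/Some_Page". This dump flattens all namespaces
// into one output tree, so alterLinks strips the leading "../X" or "/X"
// pair from such links to keep them working in the extracted copy.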
function alterLinks ( article, dom ) {
    var nameSpaceLink = function ( elem, attr ) {
        let link
        try {
            link = url.parse( elem.attribs[ attr ], true, true )
        } catch ( err ) {
            //~ console.error('alterLinks error', err, article, attr, elem.attribs[attr], elem)
            console.error( 'alterLinks', err.message, elem.attribs[ attr ], 'at', articlePath( article ))
            return
        }
        if (( link.protocol && link.protocol != 'http:' && link.protocol != 'https:' )
            || link.host || ! link.pathname )
            return

        var chunks = link.pathname.split( '/' )

        if (( chunks[ 0 ] == '' // abs path
            || chunks[ 0 ] == '..' )
            && nameSpaces.indexOf( chunks[ 1 ]) != -1 ) {
            chunks.shift();
            chunks.shift();
            link.pathname = chunks.join( '/' );
            //~ log('alterLinks', elem.attribs[attr], url.format(link));
            elem.attribs[ attr ] = url.format( link );
            return // OK
        }
        return
    }

    dom( '[src]' ).each(( i, elem ) => nameSpaceLink( elem, 'src' ))
    dom( '[href]' ).each(( i, elem ) => nameSpaceLink( elem, 'href' ))
}

// Process one directory entry: parse it, then either record a redirect
// or move its blob into place and fix up its links.
async function processArticle ( articleIndex ) {
    if ( articles[ articleIndex ] != null )
        return true // already processed

    const article = {
        index: articleIndex,
        offset: readUInt64LE( rawDirectory, articleIndex * 8 )
    }

    await getDirEntry( article )

    if ( article.mimeIdx == 0xfffe || article.mimeIdx == 0xfffd ) {
        // linktarget or deleted entry
        return true // noop
    }
    if ( article.mimeIdx == 0xffff ) { // redirect
        return storeRedirect( article )
    }

    const moved = await renameBlob( article )
    if ( ! moved )
        return null
    const dom = await loadArticle( article )
    if ( ! dom )
        return null
    alterLinks( article, dom )
    return fs.outputFile( articlePath( article ), Buffer.from( dom.html() ))
}

var rawDirectory    // URL pointer list: 8-byte directory entry offsets

// Pass over all articles in URL-pointer order.
async function processArticleList () {
    log( 'reading ArticleList' )
    articles = Array( header.articleCount )
    rawDirectory = await src.read( header.articleCount * 8, header.urlPtrPos )

    //~ log( 'articleOffsets', articleOffsets);

    for ( let i = 0; i < header.articleCount; i++ ) {
        await processArticle( i )
    }
    log( '*** articles' )
    articles.forEach(( val, i ) => log( i, val.nameSpace, val.url ))

    if ( redirectOut )
        return new Promise(( resolve, reject ) => {
            redirectOut.end( resolve )
        })
}

// The title pointer list holds article indices sorted by title;
// dump it for inspection.
async function processTitleList () {
    log( 'reading Title List' )
    const titleDirectory = await src.read( header.articleCount * 4, header.titlePtrPos )

    //~ log( 'articleOffsets', articleOffsets);
    log( '*** titles' )

    for ( let i = 0; i < header.articleCount; i++ ) {
        const idx = titleDirectory.readUInt32LE( i * 4 )
        log( i, idx, articles[ idx ].nameSpace, articles[ idx ].title, '>', articles[ idx ].url )
    }
}
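
// Redirects are appended to redirects.csv as tab-separated rows:
// namespace, url, title, target url.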

var redirectOut = null      // lazily created CSV stream for redirects.csv

function storeRedirect ( article ) {
    log( 'storeRedirect', article )

    if ( article.nameSpace == '-' && ( article.url == 'favicon' || article.url == 'mainPage' ))
        return

    if ( ! redirectOut ) {
        redirectOut = csvOutput({ delimiter: '\t' })
        redirectOut.pipe( fs.createWriteStream( osPath.join( outPath, '..', 'redirects.csv' )))
    }

    var target = articles[ article.redirectIndex ]
    if ( ! target ) { // the redirect's target article isn't processed yet
        return processArticle( article.redirectIndex )
        .then( () => storeRedirect( article ))
    }

    var item = [ article.nameSpace, article.url, article.title, target.url ]

    log( 'storeRedirect', item )

    // Honour stream backpressure: retry the write on 'drain'.
    return new Promise(( resolve, reject ) => {
        var write = function () {
            try {
                if ( ! redirectOut.write( item ))
                    return redirectOut.once( 'drain', write )
                resolve( false )
            } catch ( err ) {
                reject( err )
            }
        }
        write()
    })
}

function storeMetadata () {
    log( 'storeMetadata' );
    if ( metadata.length == 0 )
        return

    var csv = csvOutput({ delimiter: ' ' })
    csv.pipe( fs.createWriteStream( osPath.join( outPath, '..', 'metadata.csv' )))

    // Write items one by one, pausing on backpressure and resuming on 'drain'.
    return new Promise(( resolve, reject ) => {
        var i = 0;
        var write = function () {
            try {
                while ( true ) {
                    if ( i == metadata.length ) {
                        log( 'storeMetadata finished' );
                        return csv.end( resolve );
                    }
                    var item = metadata[ i ];
                    log( 'storeMetadata', metadata.length, i, item );
                    if ( ! csv.write( item ))
                        break;
                    i++
                }
                csv.once( 'drain', write )
            } catch ( err ) {
                reject( err )
            }
        }
        write()
    })
}

async function core () {
    src = new Reader( srcPath )

    await readHeader()
    await processClusterList()
    await processArticleList()
    await processTitleList()
    await storeMetadata()

    await src.close()
}
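
// Example invocation (the script name here is just illustrative):
//   node dumpzim.js ~/Downloads/wikipedia.zim wikipedia-dump
// extracts articles into wikipedia-dump/ and writes redirects.csv and
// metadata.csv beside that directory.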

function main () {
    command
        .version( packageInfo.version )
        .arguments( '<zim-file> [output-dir]' )
        .description( 'Dumps a ZIM file' )
        .parse( process.argv )

    log( command.opts() )

    if ( ! command.args[ 0 ])
        fatal( 'source ZIM file argument is required' )

    srcPath = expandHomeDir( command.args[ 0 ])
    outPath = expandHomeDir( command.args[ 1 ])
    if ( ! outPath ) {
        // default: a directory named after the source file
        var parsed = osPath.parse( srcPath )
        outPath = parsed.name
    }

    core().catch( err => fatal( 'core', err ))
}

main()