1120 lines
33 KiB
JavaScript
Executable File
1120 lines
33 KiB
JavaScript
Executable File
#!/bin/sh
|
|
":" //# -*- mode: js -*-; exec /usr/bin/env TMPDIR=/tmp node --max-old-space-size=2000 --stack-size=42000 "$0" "$@"
|
|
|
|
// rm -r ru.wikivoyage.org ; time ./wikizimmer.js https://ru.wikivoyage.org/wiki/Суздаль 2>&1 | tee zz.log
|
|
// rm -r ru.wikivoyage.org ; node --inspect --debug-brk ./wikizimmer.js https://ru.wikivoyage.org/wiki/Суздаль 2>&1 | tee zz.log
|
|
|
|
"use strict"
|
|
|
|
/*
|
|
|
|
MIT License
|
|
|
|
Copyright (c) 2017 Vadim Shlyakhov
|
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
of this software and associated documentation files (the "Software"), to deal
|
|
in the Software without restriction, including without limitation the rights
|
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
copies of the Software, and to permit persons to whom the Software is
|
|
furnished to do so, subject to the following conditions:
|
|
|
|
The above copyright notice and this permission notice shall be included in all
|
|
copies or substantial portions of the Software.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
SOFTWARE.
|
|
|
|
*/
|
|
|
|
const packageInfo = require('./package.json');
|
|
const os = require('os')
|
|
const osProcess = require('process')
|
|
const osPath = require( 'path' )
|
|
const urlconv = require('url')
|
|
const crypto = require("crypto")
|
|
|
|
const argv = require('commander')
|
|
const fs = require('fs-extra')
|
|
const Promise = require('bluebird')
|
|
const requestPromise = require('request-promise')
|
|
const sqlite = require( 'sqlite' )
|
|
const cheerio = require('cheerio')
|
|
|
|
const langs = require('langs')
|
|
const encodeurl = require('encodeurl')
|
|
const iconv = require('iconv-lite')
|
|
const lru = require('quick-lru')
|
|
|
|
const mimeTypes = require( 'mime-types' )
|
|
const mmmagic = require( 'mmmagic' )
|
|
const mimeMagic = new mmmagic.Magic( mmmagic.MAGIC_MIME_TYPE )
|
|
|
|
// Number of logical CPUs reported by the OS.
const cpuCount = os.cpus().length

// Running totals; articleCount is incremented by WikiItem.storeMetadata().
let articleCount = 0
let redirectCount = 0

// MIME types seen so far; WikiItem.mimeId() hands out indexes into this list.
const mimeIds = []

// Feature switches, overwritten by core() from the command-line options.
let saveCss = true
let saveImages = true
let savePages = true

// Rate-limited request function, assigned in processSamplePage().
let http

// Metadata database handle, assigned in initMetadataStorage().
let indexerDb
|
|
|
|
// https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247
|
|
// just in case https://www.mediawiki.org/wiki/Manual:Page_title
|
|
// Runs of characters that must not appear verbatim in a saved file name:
// control characters, < > : " \ ? *, a '%' that does not start a valid
// two-digit hex escape (together with the character(s) following it), and a
// trailing dot or space. The 'u' flag makes the character classes consume
// whole code points, so a surrogate pair is never split in the middle.
const sanitizeRE = /(?:[\x00-\x1F<>:"\\\?\*]|%(?:[^0-9A-Fa-f]|[0-9A-Fa-f][^0-9A-Fa-f])|(?:[. ]$))+/gu

/**
 * Make a string safe to use as a file name by percent-encoding every run
 * matched by sanitizeRE.
 *
 * @param {*} name - value to sanitize (converted with String())
 * @returns {string} the sanitized name
 */
function sanitizeFN ( name ) { // after https://github.com/pillarjs/encodeurl
    return String( name ).replace( sanitizeRE, ( match, offset, whole ) => {
        // encodeURIComponent() leaves '*' and '.' alone, so escape them by hand
        let escaped = encodeURIComponent( match ).replace( /\*/g, '%2A' )
        // only a dot that ends the whole name is a problem; inner dots
        // (e.g. the one before a file extension) must stay as they are
        const endsName = offset + match.length === whole.length
        if ( endsName && escaped.endsWith( '.' ))
            escaped = escaped.slice( 0, -1 ) + '%2E'
        return escaped
    })
}
|
|
|
|
/**
 * Print all the arguments to stdout, exactly as console.log() would.
 */
function log ( arg ) {
    console.log( ...arguments )
}
|
|
|
|
/**
 * Print all the arguments to stderr and terminate the process with
 * exit status 1.
 */
function fatal (arg) {
    console.error( ...arguments )
    osProcess.exit( 1 )
}
|
|
|
|
/**
 * Promise wrapper around mimeMagic.detect().
 *
 * @param {*} data - content handed to mimeMagic.detect()
 * @returns {Promise} resolves with the detected MIME type, rejects with the
 *      error reported by the detector
 */
function mimeFromData ( data ) {
    return new Promise( function ( resolve, reject ) {
        mimeMagic.detect( data, function ( error, mimeType ) {
            if ( error ) {
                reject( error )
            } else {
                resolve( mimeType )
            }
        })
    })
}
|
|
|
|
const UserAgent = `wikizimmer/${packageInfo.version} (https://github.com/vadp/zimmer email:vadp.devl@gmail.com)`
|
|
|
|
/**
 * Build a rate-limited, retrying wrapper around a promise-returning request
 * function.
 *
 * Queries are queued; at most `maxTokens` run concurrently and a new one is
 * started no sooner than `interval` ms after the previous one. When a request
 * fails with a retryable error the queue is paused: nothing is dispatched
 * until the pause elapses, and both the interval and the next pause grow.
 *
 * @param {Function} request - function( query ) returning a promise
 * @param {string} referenceUri - base against which query URLs are resolved
 * @param {number} maxTokens - maximum number of requests in flight
 * @param {number} initialInterval - initial delay between dispatches, ms
 * @param {number} [initialPause=60000] - length of the first back-off pause, ms
 * @returns {Function} function( query, priority ) returning a promise of the
 *      reply; `query` is a URL string or an options object, a truthy
 *      `priority` puts the query at the head of the queue
 */
function pooledRequest( request, referenceUri, maxTokens, initialInterval, initialPause = 60000 ) {
    const retryErrorCodes = [ 'EPROTO', 'ECONNRESET', 'ESOCKETTIMEDOUT' ]
    const retryStatusCodes = [ 503 ]
    const retryLimit = 10
    const requestTimeout = 5 * 60 * 1000
    const queue = []
    let timer = null
    let supressTimer = false
    let supressTimeout = initialPause
    let tokenCounter = 0
    let interval = initialInterval

    // arm the dispatch timer unless the queue is paused
    function setTimer () {
        if ( supressTimer )
            return
        timer = setTimeout(
            () => ( timer = null, schedule() ),
            interval
        )
    }

    // stop dispatching for a while and slow down afterwards
    function pause () {
        if ( timer ) {
            clearTimeout( timer )
            timer = null
        }
        if ( ! supressTimer ) {
            supressTimer = setTimeout(
                () => ( supressTimer = false, setTimer()),
                supressTimeout
            )
            interval = interval * 2
            supressTimeout = supressTimeout * 4
        }
    }

    function release () {
        tokenCounter --
        schedule()
    }

    // put the query back to the queue, or give up after retryLimit attempts
    function retry ( query, error ) {
        pause ()
        if ( ++ query.retries <= retryLimit ) {
            queue.push( query )
            return
        }
        query.reject( error )
        return
    }

    function acquire () {
        // a pending pause must block dispatching on its own: release() calls
        // schedule() right after a failed request, when `timer` may be null
        if ( supressTimer || timer || tokenCounter >= maxTokens )
            return false
        const query = queue.shift()
        if ( ! query )
            return false
        tokenCounter ++
        return {
            query,
            release,
            retry,
        }
    }

    function schedule () {
        let token = acquire()
        if ( ! token )
            return
        runRequest( token )
        setTimer()
    }

    function runRequest( token ) {
        let query = token.query
        return request( query )
        .catch( error => {
            if ( error.statusCode == 404 ) {
                query.reject( error )
                return
            }
            if ( retryStatusCodes.includes( error.statusCode )) {
                console.error( 'request', error.name, error.statusCode, query )
                token.retry( query, error )
                return
            }
            if ( error.cause && retryErrorCodes.includes( error.cause.code )) { // https error?
                console.error( 'request', error.name, error.cause.code, query )
                token.retry( query, error )
                return
            }
            fatal( 'request fatal error', error, query )
            return
        })
        .then( reply => {
            token.release()
            // `reply` is undefined when the error handler above has dealt with the query
            return reply ? query.resolve( reply ) : null
        })
    }

    function append ( query, priority ) {
        return new Promise(( resolve, reject ) => {
            query.resolve = resolve
            query.reject = reject
            query.retries = 0

            if ( priority )
                queue.unshift( query )
            else
                queue.push( query )

            schedule()
        })
    }

    // decodeURI() throws on escapes that are not valid UTF-8; logging must not
    // abort the request because of that
    function printable ( url ) {
        try {
            return decodeURI( url )
        } catch ( e ) {
            return url
        }
    }

    function processOptions ( query ) {
        let url
        if ( typeof query === 'string' || query.href !== undefined ) {
            // string or URL object
            url = query
            query = {}
        } else {
            url = query.uri || query.url
            query.uri = null
        }
        query.url = urlconv.resolve( referenceUri, url )
        if (! query.headers)
            query.headers = {}
        query.headers[ 'User-Agent' ] = UserAgent
        query.resolveWithFullResponse = true
        query.timeout = requestTimeout
        query.forever = true

        log('-', printable( query.url ), query.qs || '')

        return query
    }

    return function ( query, priority ) {
        return append( processOptions( query ), priority )
    }
}
|
|
|
|
/**
 * Call the wiki API (wiki.apiUrl) through the pooled `http` function and
 * parse the JSON reply.
 *
 * `format: 'json'` is added to the form when this is a POST with a form,
 * otherwise to `params`. Both `params` and `options` are modified in place.
 *
 * @param {?Object} params - query-string parameters
 * @param {Object} [options] - extra request options
 * @returns {Promise} resolves with the parsed reply; rejects with its `error`
 *      or `warning` member when one is present
 */
function api ( params, options = {} ) {
    const viaForm = options.method == 'POST' && options.form
    const carrier = viaForm ? options.form : params
    carrier.format = 'json'

    options.url = wiki.apiUrl
    options.qs = params

    return http( options ).then( reply => {
        const res = JSON.parse( reply.body )
        const problem = res.error || res.warning
        if ( problem )
            return Promise.reject( problem )
        return res
    })
}
|
|
|
|
/**
 * Call the wiki API with the parameters sent as a POST form.
 *
 * @param {Object} params - API parameters
 * @returns {Promise} whatever api() returns
 */
function apiPost( params ) {
    const options = { method: 'POST', form: params }
    return api( null, options )
}
|
|
|
|
// Shared description of the wiki being dumped. More members (info, indexUrl,
// mainPage, articlePath, articleBase, baseParsed, pageTemplate) are attached
// later by getSiteInfo() and loadTemplate().
const wiki = {
    saveDir: null, // output directory, set in processSamplePage()
    apiUrl: null, // URL of api.php, set in processSamplePage()
    metadata: {},
}
|
|
|
|
/**
 * Base class of everything that gets saved into the dump: knows how to load
 * its content over http, where to store it under wiki.saveDir and how to
 * register itself in the `articles` table.
 */
class WikiItem {
    /**
     * @param {string} zimNameSpace - name space letter, used as the directory name
     * @param {string} url - source URL of the item
     * @param {string} [title] - human readable title
     */
    constructor ( zimNameSpace, url, title ) {
        this.encoding = null
        this.revision = 0
        this.id = null
        this.loadPriority = false
        Object.assign( this, { zimNameSpace, url, title })
    }

    /**
     * Content of the item, loaded on first use and cached in `data_`.
     *
     * @returns {Promise} the content; a Buffer is decoded into a string when
     *      an encoding is known
     */
    data () {
        return ( this.data_ !== undefined ? Promise.resolve( this.data_ ) : ( this.data_ = this.load( )))
        .then( data => ! Buffer.isBuffer( data ) || this.encoding == null
            ? data
            : iconv.decode( data, this.encoding )
        )
    }

    /**
     * Download the item and fill in `url`, `headers`, `revision`, `mimeType`
     * and `encoding` from the response.
     *
     * @returns {Promise} the raw response body; rejects on a 404 reply, any
     *      other request error goes to fatal()
     */
    load () {
        return http({
                url: this.url,
                encoding: null,
            },
            this.loadPriority
        )
        .catch( err => {
            const failedUrl = err.options ? err.options.uri || err.options.url : this.url
            if ( err.statusCode == 404 ) {
                return Promise.reject( new Error( `Load error ${err.statusCode} ${failedUrl}` ))
            }
            fatal( 'Load error', err.statusCode, failedUrl )
            return Promise.reject( new Error( 'Load error' ))
        })
        .then( resp => {
            const data = resp.body

            this.url = resp.request.href // possibly redirected
            this.headers = resp.headers
            if ( ! this.revision ) {
                // "Tue, 27 Jun 2017 14:37:49 GMT" -> seconds since 2000-01-01;
                // a missing or unparsable header leaves the revision at 0
                const modifiedAt = Date.parse( this.headers[ 'last-modified' ] )
                if ( ! Number.isNaN( modifiedAt ))
                    this.revision = Math.round(( modifiedAt - Date.parse( '2000-01-01' )) / 1000 )
            }

            // tolerate a missing header, odd letter case and extra parameters
            const contentType = String( resp.headers[ "content-type" ] || '' )
            this.mimeType = contentType.split( ';' )[ 0 ].trim().toLowerCase()

            if ( this.mimeType.split( '/' )[ 0 ] == 'text' ) {
                // charset may sit in any parameter and may be quoted
                const charset = /;\s*charset\s*=\s*"?([^";\s]+)/i.exec( contentType )
                this.encoding = charset ? charset[ 1 ] : 'utf-8'
            }

            if ( this.mimeType == '' || this.mimeType == 'application/x-www-form-urlencoded' ) {
                // the server did not tell anything useful: look at the content
                return mimeFromData( data )
                .then( mimeType => {
                    this.mimeType = mimeType
                    return data
                })
                .catch( err => {
                    if ( this.mimeType == '' )
                        this.mimeType = 'application/octet-stream'
                    return data
                })
            }

            return data
        })
    }

    /** @returns {string} sanitized file name derived from the last segment of the URL path */
    baseName () {
        const urlp = urlconv.parse( this.url )
        const pathp = osPath.parse( urlp.pathname )
        return sanitizeFN( decodeURIComponent( pathp.base ))
    }

    /** @returns {string} path of the item relative to wiki.saveDir: /<name space>/<base name> */
    localPath () {
        return '/' + this.zimNameSpace + '/' + this.baseName()
    }

    /** @returns {string} name space letter followed by the base name */
    urlKey () {
        return this.zimNameSpace + this.baseName()
    }

    /** @returns {string} name space letter followed by the title, or urlKey() when there is no title */
    titleKey () {
        return this.title ? this.zimNameSpace + this.title : this.urlKey()
    }

    /**
     * Index of the item's MIME type in the global mimeIds list; the type is
     * appended to the list when seen for the first time.
     */
    mimeId () {
        if ( this.mimeType == null )
            fatal( 'this.mimeType == null', this )
        let id = mimeIds.indexOf( this.mimeType )
        if ( id == -1 ) {
            id = mimeIds.length
            mimeIds.push( this.mimeType )
        }
        return id
    }

    /**
     * Write the content under wiki.saveDir.
     *
     * @param {*} data - content to write
     * @returns {Promise} the local path; rejects when data is null/undefined
     */
    store ( data ) {
        if ( data == null )
            return Promise.reject( new Error( 'data == null' ))

        const savePath = wiki.saveDir + this.localPath()
        log( '>', savePath )

        return fs.outputFile( savePath, data )
        .then( () => this.localPath() )
    }

    /**
     * Insert the item into the `articles` table.
     *
     * @returns {Promise} id of the inserted row, also kept in `this.id`
     */
    storeMetadata ( ) {
        const row = [
            this.urlKey(),
            this.titleKey(),
            this.revision,
            this.mimeId(),
        ]
        return indexerDb.run(
            'INSERT INTO articles ( urlKey, titleKey, revision, mimeId ) VALUES ( ?,?,?,? )',
            row
        )
        .then( res => {
            this.id = res.stmt.lastID
            ++ articleCount
            return this.id
        })
    }

    /**
     * Load, store and register the item.
     *
     * @returns {Promise} the local path, or '' when any step failed (the
     *      failure is reported as a warning)
     */
    process () {
        return Promise.resolve()
        .then( () => this.data())
        .then( data => this.store( data ))
        .then( () => this.storeMetadata() )
        .then( () => this.localPath() )
        .catch( err => {
            console.warn( 'Save error', err.name, err.message, this.url, '->', this.localPath())
            return ''
        })
    }
}
|
|
|
|
// {
|
|
// "pageid": 10,
|
|
// "ns": 0,
|
|
// "title": "Baltic Sea",
|
|
// "touched": "2017-06-27T14:37:49Z",
|
|
// "lastrevid": 168879,
|
|
// "counter": 62340,
|
|
// "length": 9324,
|
|
// "fullurl": "http:\/\/www.cruiserswiki.org\/wiki\/Baltic_Sea",
|
|
// "editurl": "http:\/\/www.cruiserswiki.org\/index.php?title=Baltic_Sea&action=edit"
|
|
// }
|
|
// {
|
|
// "ns": 0,
|
|
// "title": "Anchorages of Lesvos Island",
|
|
// "missing": "",
|
|
// "fullurl": "http:\/\/www.cruiserswiki.org\/wiki\/Anchorages_of_Lesvos_Island",
|
|
// "editurl": "http:\/\/www.cruiserswiki.org\/index.php?title=Anchorages_of_Lesvos_Island&action=edit"
|
|
// }
|
|
/**
 * An article known only by its page info (see the sample records above):
 * enough to work out its URL and local file name without loading it.
 */
class ArticleStub extends WikiItem {
    /**
     * @param {Object} pageInfo - page record; `fullurl` is resolved against
     *      wiki.articleBase, `title`, `pageid` and `lastrevid` are copied
     */
    constructor ( pageInfo ) {
        const absoluteUrl = urlconv.resolve( wiki.articleBase, pageInfo.fullurl )
        super( 'A', absoluteUrl, pageInfo.title )
        this.info = pageInfo
        this.mwId = pageInfo.pageid
        this.revision = pageInfo.lastrevid
    }

    /**
     * @returns {?string} file name of the article ('.html' appended), or
     *      null when the URL does not start with wiki.articleBase
     */
    baseName () {
        const isLocal = this.url && this.url.startsWith( wiki.articleBase )
        if ( ! isLocal )
            return null // not a local article

        const { pathname } = urlconv.parse( this.url )
        const subPath = pathname.replace( wiki.articlePath, '' )
        return sanitizeFN( decodeURIComponent( subPath )) + '.html'
    }
}
|
|
|
|
/**
 * An article whose content gets downloaded and converted into a page of the
 * dump: the body is moved into the page template, links are rewritten to
 * local files and images are saved alongside.
 */
class Article extends ArticleStub {
    constructor ( pageInfo ) {
        super( pageInfo )
        // relative path from the article's directory up to the name space root
        this.basePath = '../'.repeat( this.baseName().split( '/' ).length - 1 )
    }

    /** Load the page and run it through preProcess(). */
    load () {
        return super.load()
        .then( body => this.preProcess( body ))
    }

    /**
     * Turn a downloaded wiki page into a self-contained page of the dump.
     *
     * @param {*} data - downloaded page
     * @param {*} [reply] - only used in the error log
     * @returns {Promise|*} promise of the resulting html (undefined when
     *      saving images failed), or `data` itself when it can't be parsed
     */
    preProcess( data, reply ) {
        let src
        try {
            src = cheerio.load( data )
        } catch ( e ) {
            log( 'cheerio.load error', e, data, reply )
            return data
        }
        const content = src( '#bodyContent' )
        const dom = cheerio.load( wiki.pageTemplate )
        dom( 'title' ).text( this.title )

        dom( '#bodyContent' ).replaceWith( content )

        // modify links
        let css = dom( '#layout-css' )
        css.attr( 'href', this.basePath + css.attr( 'href' ))

        dom( 'a' ).toArray().forEach( elem => {
            this.transformGeoLink( elem )
            this.transformLink( elem )
        })
        // map area links
        dom( 'area' ).toArray().forEach( elem => {
            this.transformLink( elem )
        })

        // remove comments
        dom( '*' ).contents().each( (i, elem) => {
            if ( elem.type === 'comment' ) {
                dom( elem ).remove()
            }
        })

        return Promise.all( dom( 'img' ).toArray().map(
            elem => this.saveImage( elem )
        ))
        .then ( () => {
            this.mimeType = 'text/html'
            this.encoding = 'utf-8'
            const out = dom.html()
            return out
        })
        .catch( err => {
            log( err )
        })
    }

    /**
     * Rewrite the href of a link: edit links, links into other name spaces
     * and links to images lose their href, links to local articles are
     * pointed at the saved file, everything else is left alone.
     */
    transformLink( elem ) {
        const url = elem.attribs.href
        if (! url || url.startsWith( '#' ))
            return

        if ( url.includes( 'action=edit' )) {
            delete elem.attribs.href
            return
        }
        const link = new ArticleStub({ fullurl: url })

        const parsed = urlconv.parse( link.url )
        const path = parsed.pathname
        if ( ! path || path == '/' )
            return

        const baseName = link.baseName()
        if ( baseName != null ) { // local article link
            if ( path.includes( ':' )) {
                delete elem.attribs.href // block other name spaces
            } else {
                // the file name may hold literal '%XX' sequences produced by
                // the sanitizer, so it has to be URI-encoded (as image paths
                // are); the fragment of the original link is kept
                elem.attribs.href = this.basePath + encodeURI( baseName ) + ( parsed.hash || '' )
            }
        }
        const pathlc = path.toLowerCase()
        for ( const ext of [ '.jpg', '.jpeg', '.png', '.gif', '.svg' ]) {
            if (pathlc.endsWith( ext )) {
                delete elem.attribs.href // block links to images
            }
        }
    }

    /** Point a link carrying data-lat / data-lon attributes to a geo: URI. */
    transformGeoLink( elem ) {
        const lat = elem.attribs[ "data-lat" ]
        const lon = elem.attribs[ "data-lon" ]
        if ( lat == null || lon == null )
            return

        elem.attribs.href = `geo:${lat},${lon}`
    }

    /**
     * Save the image behind an <img> element and point its src to the local
     * copy; srcset is dropped, data: URLs are left untouched.
     */
    saveImage ( elem ) {
        delete elem.attribs.srcset
        let url = elem.attribs.src
        if (! url || url.startsWith( 'data:' ))
            return url
        const image = new Image( url )
        return image.process()
        .then( localPath => {
            elem.attribs.src = encodeURI( this.basePath + '..' + localPath )
        })
    }
}
|
|
|
|
/**
 * A redirect page: has no content of its own, is registered in `articles`
 * with the special MIME id 0xffff and gets a row in `redirects` naming its
 * target.
 */
class Redirect extends ArticleStub {

    /**
     * @param {Object} info - page info; `to` is the page info of the target,
     *      `toFragment` an optional fragment within it
     */
    constructor ( info ) {
        super( info )
        this.to = info.to
        this.toFragment = info.toFragment
    }

    data() {
        return null
    }

    mimeId () {
        return 0xffff
    }

    store () {
        return null
    }

    /**
     * Register the redirect and its target.
     *
     * @returns {Promise} id of the redirect's `articles` row, available once
     *      the `redirects` row has been written as well
     */
    storeMetadata ( ) {
        return super.storeMetadata()
        .then( () => {
            const target = new ArticleStub( this.to )
            const row = [
                this.id,
                target.urlKey(),
                this.toFragment,
            ]

            log( '-->', this.title || this.url, row)

            // the promise has to be returned: otherwise the insert is left
            // floating and its failure is never noticed
            return indexerDb.run(
                'INSERT INTO redirects (id, targetKey, fragment) VALUES (?,?,?)',
                row
            )
        })
        .then( () => this.id )
    }
}
|
|
|
|
/**
 * The fixed 'mainpage' entry: a redirect whose target is looked up through
 * the API from the title in wiki.mainPage.
 */
class MainPage extends Redirect {
    constructor ( ) {
        super({ fullurl: 'mainpage' })
    }

    baseName () {
        return 'mainpage'
    }

    /** Ask the API for the main page, then store the redirect to it. */
    storeMetadata ( ) {
        const query = {
            action: 'query',
            titles: wiki.mainPage,
            redirects: '',
            prop: 'info',
            inprop: 'url',
        }
        return apiPost( query ).then( reply => {
            const pages = reply.query.pages
            // the last page of the reply becomes the target
            for ( const key of Object.keys( pages ))
                this.to = pages[ key ]
            return super.storeMetadata()
        })
    }
}
|
|
|
|
/**
 * A plain-text entry of the 'M' name space whose content is given up front
 * instead of being downloaded.
 */
class Metadata extends WikiItem {
    /**
     * @param {string} url - name of the entry
     * @param {*} data - its content
     */
    constructor ( url, data ) {
        super( 'M', url )
        Object.assign( this, { mimeType: 'text/plain', data_: data })
    }

    /** @returns {*} the content handed to the constructor, as is */
    data () {
        return this.data_
    }
}
|
|
|
|
//~ const urlCache = lru( 5000 )
|
|
const urlCache = new lru({ maxSize:500 })
|
|
|
|
/**
 * Something a page depends on (an image, a style sheet ...). Such items are
 * shared between pages, so the outcome of process() is cached by URL.
 */
class PageComponent extends WikiItem {
    /**
     * File name for the component. For URLs carrying a `key=value` query the
     * name is the last path segment including the query, plus an extension
     * derived from the MIME type when one is known; otherwise it is the last
     * path segment with the extension lower-cased.
     */
    baseName () {
        let name
        const urlp = urlconv.parse( this.url )
        if ( urlp.query && urlp.query.includes( '=' ) && this.mimeType ) {
            const pathp = osPath.parse( urlp.path )
            // extension() returns false for a type it doesn't know:
            // don't turn that into a '.false' suffix
            const ext = mimeTypes.extension( this.mimeType )
            name = ext ? pathp.base + '.' + ext : pathp.base
        } else {
            const pathp = osPath.parse( urlp.pathname )
            name = pathp.name + pathp.ext.toLowerCase()
        }
        return sanitizeFN( decodeURIComponent( name ))
    }

    /**
     * Process the component unless the same URL is already in the cache.
     *
     * @returns {Promise} local path of the saved component
     */
    process () {
        let saved = urlCache.get( this.url )
        if (! saved ) {
            saved = super.process()
            urlCache.set( this.url, saved )

            saved.then( localPath => { // keep just the resulting path in the cache
                urlCache.set( this.url, Promise.resolve( localPath ))
            })
        }
        return saved
    }
}
|
|
|
|
/**
 * An image of the 'I' name space; loaded with priority. When images are
 * switched off (saveImages is false) nothing is downloaded and only the
 * would-be local path is reported.
 */
class Image extends PageComponent {
    constructor ( url ) {
        super( 'I', url)
        this.loadPriority = true
    }

    /** @returns {Promise} local path of the image */
    process () {
        return saveImages
            ? super.process()
            : Promise.resolve( this.localPath() )
    }
}
|
|
|
|
//~ const layoutFileNames = new Set()
|
|
|
|
/**
 * A layout resource (style sheet or a file it refers to), kept in the '-'
 * name space.
 */
class StyleItem extends PageComponent {
    /** @param {string} url - source URL of the resource */
    constructor ( url ) {
        const LAYOUT_NAMESPACE = '-'
        super( LAYOUT_NAMESPACE, url )
    }
}
|
|
|
|
/**
 * The dump's icon, stored as 'favicon': the wiki's logo, or the openZIM
 * logo when the site info names none.
 */
class FavIcon extends StyleItem {
    constructor ( ) {
        const fallbackLogo = 'http://www.openzim.org/w/images/e/e8/OpenZIM-wiki.png'
        super( wiki.info.general.logo || fallbackLogo )
    }

    baseName () {
        return 'favicon'
    }
}
|
|
|
|
const cssDependencies = new Set()
|
|
|
|
/**
 * 'layout.css': the local stub.css followed by every non-print style sheet
 * linked from the sample page, with the resources they refer to saved
 * locally and the references rewritten accordingly.
 */
class GlobalCss extends StyleItem {
    /** @param {Function} sourceDOM - cheerio document of the sample page */
    constructor ( sourceDOM ) {
        super( 'layout.css' )
        this.sourceDOM = sourceDOM
        this.mimeType = 'text/css'
    }

    /** @returns {Promise<string>} the combined style sheet */
    load () {
        // get css stylesheets
        const cssLinks = this.sourceDOM( 'link[rel=stylesheet][media!=print]' ).toArray()
        const requests = cssLinks.map( elem => this.transformCss( elem.attribs.href ))

        const stub = osPath.resolve( module.filename, '../stub.css' )
        requests.unshift( fs.readFile( stub ))

        return Promise.all( requests )
        .then( chunks => chunks.join( '\n' ))
    }

    /**
     * Fetch one style sheet, save everything its url(...) references point
     * to and rewrite those references to the saved copies. data: URLs and
     * references whose download failed are left as they are.
     *
     * @param {string} cssUrl - URL of the style sheet
     * @returns {Promise<string>} the rewritten sheet under a header comment
     */
    transformCss( cssUrl ) {
        return Promise.coroutine( function* () {
            let css = new StyleItem( cssUrl )
            const src = yield css.data()

            // first pass: start saving every distinct non-data URL
            const urlre = /(url\(['"]?)([^\)]*[^\)'"])(['"]?\))/g
            const pending = new Map()
            src.replace( urlre, ( match, start, url ) => {
                if ( ! url.startsWith( 'data:' ) && ! pending.has( url )) {
                    const cssItem = new StyleItem( urlconv.resolve( cssUrl, url ))
                    pending.set( url, cssItem.process() )
                }
                return match
            })
            const urls = Array.from( pending.keys())
            const paths = yield Promise.all( Array.from( pending.values()))

            // second pass: look every reference up by its URL, so skipped
            // references can't shift the results against each other
            const localByUrl = new Map()
            urls.forEach(( url, i ) => localByUrl.set( url, paths[ i ]))
            const transformed = src.replace( urlre, ( match, start, url, end ) => {
                const rurl = localByUrl.get( url )
                if ( ! rurl )
                    return match
                // '/-/name' -> 'name': layout.css lives in the same directory
                return start + rurl.slice( 3 ) + end
            })

            const outcss = `/*
 *
 * from ${cssUrl}
 *
 */
${transformed}
`
            return outcss
        }) ()
    }
}
|
|
|
|
/**
 * Download the sample page and derive the basic settings from it: the pooled
 * `http` function (based on the page's final URL), the output directory
 * (named after the host) and the API URL (derived from the page's history
 * link).
 *
 * @param {string} samplePageUrl - URL of a page of the wiki
 * @param {boolean} rmdir - remove the output directory first
 * @returns {Promise} cheerio document of the sample page; rejects when the
 *      page has no history link to derive the API URL from
 */
function processSamplePage ( samplePageUrl, rmdir) {
    return Promise.coroutine( function* () {
        const resp = yield requestPromise({
            url: encodeurl( samplePageUrl ),
            resolveWithFullResponse: true,
        })

        // set base for further http requests
        const realUrl = resp.request.href
        http = pooledRequest( requestPromise, realUrl, 1, 1 )

        // create download directory
        const urlp = urlconv.parse( realUrl )
        wiki.saveDir = sanitizeFN( urlp.hostname )
        if ( rmdir )
            yield fs.remove( wiki.saveDir )
        yield fs.mkdirs( wiki.saveDir )

        const dom = cheerio.load( resp.body )
        const historyLink = dom('#ca-history a').attr('href')
        if ( ! historyLink )
            throw new Error( `No history link (#ca-history) found at ${realUrl}: unable to locate the wiki API` )

        // the history link points at index.php; api.php sits next to it
        const parsedUrl = urlconv.parse(urlconv.resolve(resp.request.href, historyLink))
        log(parsedUrl)
        parsedUrl.search = null
        parsedUrl.hash = null
        const indexPhp = urlconv.format(parsedUrl)
        parsedUrl.pathname = parsedUrl.pathname.replace('index.php', 'api.php')

        wiki.apiUrl = urlconv.format(parsedUrl)
        log(indexPhp, wiki.apiUrl)

        return dom
    })()
}
|
|
|
|
/**
 * Read stub.html (located next to this script) into wiki.pageTemplate.
 *
 * @returns {Promise} the template as read by fs.readFile()
 */
function loadTemplate () {
    const stubPath = osPath.resolve( module.filename, '../stub.html' )
    return fs.readFile( stubPath ).then( stub => {
        wiki.pageTemplate = stub
        return stub
    })
}
|
|
|
|
/**
 * Query the API for the site info and record it in `wiki`: the info itself,
 * the index script, the main page title and the article path / base URL.
 *
 * @returns {Promise} resolves (with no value) once `wiki` is filled in
 */
function getSiteInfo () {
    const fetchInfo = Promise.coroutine(function* () {
        const { query: info } = yield api ({
            action: 'query',
            meta: 'siteinfo',
            siprop: 'general|namespaces|namespacealiases',
        })
        log( 'SiteInfo', info )

        wiki.info = info
        const general = info.general
        wiki.indexUrl = general.script
        wiki.mainPage = general.mainpage
        // "/wiki/$1" -> "/wiki/"
        wiki.articlePath = general.articlepath.split('$')[0]
        // everything of the base URL up to and including the article path
        wiki.articleBase = general.base.split( wiki.articlePath )[0] + wiki.articlePath
        wiki.baseParsed = urlconv.parse( wiki.articleBase )
    })
    return fetchInfo()
}
|
|
|
|
/**
 * Store the main page redirect, the favicon and the ZIM metadata entries.
 *
 * Metadata entries (see the openZIM metadata specification):
 *   Name         stable identifier, prefixed by the packager name
 *   Title        title of the zim file
 *   Creator      creator(s) of the content
 *   Publisher    creator of the zim file itself
 *   Date         creation date, YYYY-MM-DD
 *   Description  description of the content
 *   Language     ISO 639-3 language identifier
 *   Source       URI of the original source
 * Optional Tags, Relation and Counter are not written. A favicon is
 * mandatory as well and is expected at /-/favicon.
 *
 * @returns {Promise} resolves when everything has been processed
 */
function saveMetadata () {

    let lang = wiki.info.general.lang.split('-')[0] // https://www.mediawiki.org/wiki/Manual:Language#Notes
    if (lang.length == 2) {
        // 2-letter -> 3-letter code; keep the original one when the
        // lookup doesn't know it
        const langObj = langs.where( '1', lang )
        if ( langObj && langObj['3'] )
            lang = langObj['3']
    }

    const metadata = {
        Name: 'wikizimmer.' + wiki.info.general.wikiid,
        Title: wiki.info.general.sitename,
        Creator: '',
        Publisher: '',
        Date: new Date().toISOString().split('T')[0],
        Description: '',
        Language: lang,
        //~ Tags: '',
        //~ Relation: '',
        //~ Counter: '',
        Source: urlconv.resolve( wiki.articleBase, wiki.info.general.server ),
    }

    return Promise.coroutine( function * () {
        yield new MainPage().process()
        yield new FavIcon().process()

        for ( const key of Object.keys( metadata )) {
            yield new Metadata( key, metadata[ key ] ).process()
        }
    }) ()
}
|
|
|
|
/**
 * Write the collected MIME types into the `mimeTypes` table, one insert at
 * a time; row ids are the 1-based positions in mimeIds.
 *
 * @returns {Promise} resolves when all the rows are inserted
 */
function saveMimeTypes () {
    const insertAll = Promise.coroutine( function * () {
        // snapshot: the number of rows is fixed when the inserts start
        const known = mimeIds.slice()
        for ( const [ index, value ] of known.entries()) {
            yield indexerDb.run(
                'INSERT INTO mimeTypes (id, value) VALUES (?,?)',
                [ index + 1, value ]
            )
        }
    })
    return insertAll()
}
|
|
|
|
/**
 * Resolve a batch of redirect pages through the API and store a Redirect
 * for each one that leads to an existing page of name space 0.
 *
 * Redirects that can't be resolved (no redirect entry in the reply,
 * interwiki target, circular chain, missing target, target in another name
 * space) are skipped.
 *
 * @param {Object[]} pageInfos - page info records of the redirect pages;
 *      `to` and `toFragment` are set on the ones that get stored
 * @returns {Promise} array with one entry per page: the result of
 *      Redirect#process() or null when skipped; resolves with no value for
 *      an empty batch
 */
function batchRedirects ( pageInfos ) {
    if ( pageInfos.length == 0 )
        return Promise.resolve()

    const titles = pageInfos.map( item => item.title ).join( '|' )

    return apiPost({
        action: 'query',
        titles,
        redirects: '',
        prop: 'info',
        inprop: 'url',
    })
    .then( reply => {
        // Maps, not plain objects: page titles are arbitrary strings
        const redirectsByFrom = new Map()
        for ( const item of reply.query.redirects || [] )
            redirectsByFrom.set( item.from, item )

        const targets = reply.query.pages || {}
        const targetsByTitle = new Map()
        for ( const key of Object.keys( targets ))
            targetsByTitle.set( targets[ key ].title, targets[ key ] )

        const done = pageInfos.map( item => {
            let target = null
            let rdr
            const visited = new Set()
            // follow the chain of redirects until a known page is reached
            for ( let from = item.title; target == null; from = rdr.to ) {
                rdr = redirectsByFrom.get( from )
                if ( rdr == null || visited.has( from ))
                    return null // unresolved or circular redirection
                visited.add( from )
                if ( rdr.tointerwiki != null || rdr.to == item.title )
                    return null // interwiki or circular redirection
                target = targetsByTitle.get( rdr.to )
            }
            if ( target.missing != null )
                return null // no target exists
            if ( target.ns != 0 )
                return null
            item.to = target
            item.toFragment = rdr.tofragment
            return new Redirect( item ).process()
        })
        return Promise.all( done )
    })
}
|
|
|
|
/**
 * Walk through the pages of the wiki (the given titles, or all pages of
 * name space 0) and process them batch by batch.
 *
 * For every API reply: the continuation point is recorded in the `continue`
 * table, then, within one database transaction, every page of the reply is
 * processed as an Article (unless savePages is off) and redirect pages are
 * handed to batchRedirects() in groups of up to queryMaxTitles. The loop
 * ends when the reply carries no continuation key.
 *
 * @param {Object} options - `titles`: '|'-separated page titles; all pages
 *      are enumerated when absent
 * @returns {Promise} resolves when the last batch is done
 */
function batchPages ( options ) {
    const pageList = options.titles
    const queryPageLimit = 500
    const queryMaxTitles = 50

    const crawl = Promise.coroutine( function* () {
        const queryOpt = {
            action: 'query',
            prop: 'info',
            inprop: 'url',
        }
        if ( pageList ) {
            queryOpt.titles = pageList
        } else {
            queryOpt.generator = 'allpages'
            queryOpt.gaplimit = queryPageLimit
            queryOpt.gapnamespace = '0'
            queryOpt.rawcontinue = ''
        }

        let continueFrom = ''
        for (;;) {
            // remember where we are; null marks the end of the walk
            yield indexerDb.run(
                'INSERT OR REPLACE INTO continue (id, "from") VALUES (1, ?)',
                [ continueFrom ]
            )
            if ( continueFrom == null )
                break

            yield indexerDb.run( 'BEGIN' )

            const resp = yield api( queryOpt )
            let pages = {}
            try {
                pages = resp.query.pages
            }
            catch (e) {
                log( 'getPages', 'NO PAGES' )
            }

            let pendingRedirects = []
            const done = Object.keys( pages ).map( key => {
                const pageInfo = pages[ key ]
                if ( pageInfo.redirect == null ) {
                    if ( ! savePages ) {
                        return null
                    }
                    log( '---', pageInfo.title )
                    return new Article( pageInfo ).process()
                }

                log( '>>>' , pageInfo.title )
                pendingRedirects.push( pageInfo )
                if ( pendingRedirects.length != queryMaxTitles ) {
                    return null
                }
                // a full group: resolve it and start collecting anew
                const res = batchRedirects( pendingRedirects )
                pendingRedirects = []
                return res
            })
            done.push( batchRedirects( pendingRedirects ))
            yield Promise.all( done )

            yield indexerDb.run( 'COMMIT' )

            continueFrom = null
            try {
                const continuation = resp[ 'query-continue' ].allpages
                const continueKey = Object.keys( continuation )[ 0 ]
                continueFrom = continuation[ continueKey ]
                queryOpt[ continueKey ] = continueFrom
                log( '...', continueFrom )
            }
            catch ( e ) {
                log( 'getPages', 'No continue key' )
            }
        }
        log( '**************** done' )
    })
    return crawl()
}
|
|
|
|
/**
 * Save the page styling taken from the sample page, unless styling is
 * switched off (saveCss is false).
 *
 * @param {Function} dom - cheerio document of the sample page
 * @returns {Promise} result of GlobalCss#process(), or a promise of no value
 */
function loadCss( dom ) {
    return saveCss
        ? new GlobalCss( dom ).process()
        : Promise.resolve()
}
|
|
|
|
/**
 * Create a fresh metadata.db in the output directory (an existing one is
 * deleted), keep its handle in `indexerDb` and create the tables: articles,
 * redirects, mimeTypes and continue.
 *
 * @param {*} samplePageDOM - passed through untouched
 * @returns {Promise} resolves with samplePageDOM once the schema exists
 */
function initMetadataStorage ( samplePageDOM ) {
    const dbName = osPath.join( wiki.saveDir, 'metadata.db' )

    const articleColumns = [
        'id INTEGER PRIMARY KEY',
        'mimeId INTEGER',
        'revision INTEGER',
        'urlKey TEXT',
        'titleKey TEXT',
    ]
    const schema = [
        'PRAGMA synchronous = OFF;',
        'PRAGMA journal_mode = WAL;',
        'BEGIN;',
        'CREATE TABLE articles (' + articleColumns.join( ',' ) + ');',
        'CREATE TABLE redirects (id INTEGER PRIMARY KEY,targetKey TEXT, fragment TEXT );',
        'CREATE TABLE mimeTypes (id INTEGER PRIMARY KEY,value TEXT);',
        'CREATE TABLE continue (id INTEGER PRIMARY KEY,"from" TEXT);',
        'COMMIT;',
    ].join( '' )

    return fs.unlink( dbName )
    .catch( () => null ) // it's fine if there was nothing to delete
    .then( () => sqlite.open( dbName ))
    .then( db => {
        indexerDb = db
        return indexerDb.exec( schema )
    })
    .then( () => samplePageDOM )
}
|
|
|
|
/**
 * Close the metadata database.
 *
 * @returns {*} whatever indexerDb.close() returns
 */
function closeMetadataStorage () {
    const closing = indexerDb.close()
    return closing
}
|
|
|
|
/**
 * Run the whole dump: sample page, metadata storage, styling, site info,
 * page template, pages, metadata, MIME types, storage shutdown.
 *
 * @param {string} samplePage - URL of a page of the wiki to dump
 * @param {Object} options - `titles`, `rmdir`, `noimages`, `nocss`, `nopages`
 * @returns {Promise} settles when the run is over; a failure is logged and
 *      reflected in the process exit status rather than rejected
 */
function core ( samplePage, options ) {
    let titles = options.titles
    saveImages = ! options.noimages
    saveCss = ! options.nocss
    savePages = ! options.nopages

    return processSamplePage( samplePage, options.rmdir )
    .then( initMetadataStorage )
    .then( loadCss )
    .then( getSiteInfo )
    .then( loadTemplate )
    .then( () => batchPages( { titles }))
    .then( saveMetadata )
    .then( saveMimeTypes )
    .then( closeMetadataStorage )
    .catch( err => {
        log( err )
        // let the caller of the script know that the dump is incomplete
        osProcess.exitCode = 1
    })
}
|
|
|
|
/**
 * Command-line entry point: parse the options and start the dump of the
 * wiki the given page belongs to. Prints the help and exits with status 1
 * when no page URL is given.
 */
function main () {

    argv
    .version( packageInfo.version )
    .arguments( '<wiki-page-URL>' )
    .description( `Dump a static-HTML snapshot of a MediaWiki-powered wiki.

  Where:
    wiki-page-URL \t URL of a sample page at the wiki to be dumped.
    \t\t\t This page's styling will be used as a template for all pages in the dump.` )
    // the option takes a value: without the placeholder it is parsed as a boolean flag
    .option( '-t, --titles <titles>', 'get only titles listed (separated by "|")' )
    .option( '-r, --rmdir', 'delete destination directory before processing the source' )
    // long options need two dashes to end up as options.noimages etc.
    .option( '--noimages', "don't download images" )
    .option( '--nocss', "don't save page styling" )
    .option( '--nopages', "don't save downloaded pages" )
    .parse( process.argv )

    log( argv )

    const args = argv.args

    if ( ! args || args.length == 0 ) {
        argv.outputHelp()
        osProcess.exit( 1 )
        return
    }

    core( args[0], argv )
}
|
|
|
|
main ()
|
|
;
|