#!/bin/sh
":" //# -*- mode: js -*-; exec /usr/bin/env TMPDIR=/tmp node --max-old-space-size=2000 --stack-size=42000 "$0" "$@"

// node --inspect-brk

"use strict"

/*

MIT License

Copyright (c) 2017 Vadim Shlyakhov

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

*/ const packageInfo = require('./package.json'); const os = require('os') const osProcess = require('process') const osPath = require( 'path' ) const urlconv = require('url') const crypto = require("crypto") const command = require('commander') const fs = require('fs-extra') const requestPromise = require('request-promise-native') const sqlite = require( 'sqlite' ) const cheerio = require('cheerio') const minify = require('html-minifier').minify const langs = require('langs') const encodeurl = require('encodeurl') const iconv = require('iconv-lite') const lru = require('quick-lru') const mimeTypes = require( 'mime-types' ) const mmmagic = require( 'mmmagic' ) const mimeMagic = new mmmagic.Magic( mmmagic.MAGIC_MIME_TYPE ) const moment = require("moment") require("moment-duration-format") const startTime = Date.now() const cpuCount = os.cpus().length const mimeIds = [] let articleCount = 0 let redirectCount = 0 let http // http request // https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247 // just in case https://www.mediawiki.org/wiki/Manual:Page_title let sanitizeRE = /(?:[\x00-\x1F<>:"~\\\?\*]|%(?:[^0-9A-Fa-f]|[0-9A-Fa-f][^0-9A-Fa-f])|(?:[. ]$))+/g function sanitizeFN ( name ) { // after https://github.com/pillarjs/encodeurl return String( name ).replace( sanitizeRE, encodeURIComponent ).replace( /%/g, '~' ) //~ return sanitizeFilename( name, { replacement: '.' }) } function elapsedStr( from , to = Date.now()) { return moment.duration( to - from ).format('d[d]hh:mm:ss.SSS',{ stopTrim: "h" }) } function log ( ...args ) { console.log( elapsedStr( startTime ), ... args ) } function warning ( ...args ) { log( ...args ) } function fatal ( ...args ) { console.trace( elapsedStr( startTime ), ... 
args ) osProcess.exit( 1 ) } function mimeFromData ( data ) { return new Promise(( resolve, reject ) => mimeMagic.detect( data, ( error, mimeType ) => { if ( error ) return reject( error ) return resolve( mimeType ) }) ) } let UserAgent = `wikizimmer/${packageInfo.version} (https://github.com/vadp/zimmer email:vadp.devl@gmail.com)` const UserAgentFirefox = 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/12.0' function pooledRequest( request, referenceUri, maxTokens = 1, interval = 10 ) { const retryErrorCodes = [ 'EPROTO', 'ECONNRESET', 'ESOCKETTIMEDOUT' ] const retryStatusCodes = [ 408, 420, 423, 429, 500, 503, 504, 509, 524 ] const retryLimit = 10 const retryExternal = command.retryExternal == null ? retryLimit : command.retryExternal const requestTimeout = 5 * 60 * 1000 const refHost = urlconv.parse( referenceUri ).host const hostQueues = {} class Queue { constructor () { this.queue = [] this.timer = null this.supressTimer = null this.supressTimeout = 60 * 1000 this.tokenCounter = 0 this.interval = interval } reshedule () { if ( this.supressTimer ) return this.timer = setTimeout( () => ( this.timer = null, this.run() ), this.interval ) } pause ( query ) { clearTimeout( this.timer ) this.timer = null clearTimeout( this.supressTimer ) this.supressTimer = setTimeout( () => ( this.supressTimer = false, this.reshedule()), query.retries * this.supressTimeout ) } retry ( query, error ) { const retryCause = retryStatusCodes.includes( error.statusCode ) ? error.statusCode : error.cause && retryErrorCodes.includes( error.cause.code ) ? error.cause.code : false const maxRetries = query.external ? retryExternal : retryLimit if ( ! 
retryCause || query.retries > maxRetries) return false if ( query.retries > maxRetries / 2 ) { this.interval = this.interval * 2 } query.retries ++ log( 'retry request', query.retries, this.interval, error.name, retryCause, error.options.uri || error.options.url ) // , query ) this.queue.push( query ) this.pause( query ) return true } async submit ( query ) { this.tokenCounter ++ try { const reply = await request( query ) this.tokenCounter -- if ( reply ) query.resolve( reply ) else query.reject( ) this.reshedule() } catch ( error ) { this.tokenCounter -- if ( ! this.retry( query, error )) { warning( 'HTTP error', error.cause && error.cause.code || error.statusCode, error.options.uri || error.options.url ) query.reject( error ) this.reshedule() return } } } run () { if ( this.timer || this.supressTimer || this.tokenCounter >= maxTokens ) return const query = this.queue.shift() if ( query ) { //~ if ( query.retries > 0 ) //~ debugger this.submit( query ) this.reshedule() } } append ( query ) { return new Promise(( resolve, reject ) => { query.resolve = resolve query.reject = reject query.retries = 0 if ( query.priority ) this.queue.unshift( query ) else this.queue.push( query ) this.run() }) } } function processOptions ( query ) { let url if ( typeof query === 'string' || query.href !== undefined ) { // string or URL object url = query query = {} } else { url = query.uri || query.url delete query.uri } query.url = urlconv.resolve( referenceUri, url ) query.host = urlconv.parse( query.url ).host query.external = query.host != refHost if ( ! query.headers ) query.headers = {} query.headers[ 'User-Agent' ] = UserAgent query.headers[ 'Referer' ] = referenceUri query.resolveWithFullResponse = true query.timeout = requestTimeout query.forever = true log( '^', decodeURI( query.url ), query.qs || '' ) return query } return function ( query, queueId ) { processOptions( query ) if ( ! queueId ) queueId = query.host let queue = hostQueues[ queueId ] if ( ! 
queue ) { queue = new Queue hostQueues[ queueId ] = queue } return queue.append( query ) } } async function api ( params, options = {} ) { if ( options.method == 'POST' && options.form ) options.form.format = 'json' else params.format = 'json' Object.assign( options, { url: wiki.apiUrl, qs: params, }) const reply = await http( options ) const res = JSON.parse( reply.body ) return res.error || res.warning ? Promise.reject( res.error || res.warning ) : res } function apiPost( params ) { return api( null, { method: 'POST', form: params, }) } class NameSpaceSet { constructor ( SiteInfo ) { this.nameSpaces = {} this.queue = [] this.scheduled = new Set Object.keys( SiteInfo.namespaces ).forEach( ns => { const nsInfo = SiteInfo.namespaces[ ns ] this.nameSpaces[ ns ] = nsInfo if ( nsInfo[ '*' ] !== undefined ) this.nameSpaces[ nsInfo[ '*' ]] = nsInfo if ( nsInfo.canonical !== undefined ) this.nameSpaces[ nsInfo.canonical ] = nsInfo }) if ( SiteInfo.namespacealiases ) { SiteInfo.namespacealiases.forEach( aliasInfo => this.nameSpaces[ aliasInfo[ '*' ]] = this.nameSpaces[ aliasInfo.id ] ) } } isScheduled ( nsId ) { return this.scheduled.has( nsId ) } toBeDownloaded ( title ) { const colIndex = title.indexOf( ':' ) if ( colIndex == -1 ) return true const prefix = title.slice( 0, colIndex ) const ns = this.nameSpaces[ prefix ] if ( ns !== undefined ) { return this.isScheduled( ns.id ) } return true } toDownload ( nsList = '0' ) { nsList.split( ',' ).map( nsId => this.schedule( nsId )) } schedule ( nsId ) { const ns = this.nameSpaces[ nsId ] if ( ! ns ) { fatal( 'This wiki does not have name space', nsId ) return } if ( ! 
this.isScheduled( ns.id )) { this.scheduled.add( ns.id ) this.queue.push( ns.id ) } } * [Symbol.iterator] () { while ( this.queue.length != 0 ) { yield this.queue.shift() } } } const wiki = { saveDir: null, apiUrl: null, metadata: {}, nameSpaces: null, } class WikiItem { constructor ( zimNameSpace, url, title ) { this.encoding = null this.revision = 0 this.id = null this.loadPriority = false Object.assign( this, { zimNameSpace, url, title }) } async getData () { let data = await ( this.data !== undefined ? this.data : ( this.data = this.load( ))) return this.preProcess( data ) } preProcess ( data ) { return data } urlReplacements () { if ( typeof command.urlReplace != 'object' ) { return this.url } else { return command.urlReplace.reduce( ( acc, [ patt, repl ]) => acc.replace( patt, repl ), this.url ) } } blackListed () { if ( typeof command.urlBlacklist != 'object' ) { return false } return command.urlBlacklist.some( patt => this.url.includes( patt )) } async load () { let resp try { resp = await http({ url: this.urlReplacements(), encoding: null, priority: this.loadPriority }) } catch ( error ) { if ( ! command.downloadErrors || error.options.external || error.statusCode == 404 || error.statusCode == 400 ) { throw error } fatal( 'Fatal load error' ) //~ return Promise.reject( new Error( 'Load error' )) } let data = resp.body this.url = resp.request.href // possibly redirected this.headers = resp.headers if ( ! 
this.revision ) { const modified = this.headers[ 'last-modified' ] // "Tue, 27 Jun 2017 14:37:49 GMT" const dateBasedRevision = Math.round(( Date.parse( modified ) - Date.parse( '2000-01-01' )) / 1000 ) || 0 this.revision = dateBasedRevision } const contentType = resp.headers[ "content-type" ] let csplit = contentType.split( ';' ) this.mimeType = csplit[ 0 ] if ( this.mimeType.split( '/' )[ 0 ] == 'text' ) { this.encoding = 'utf-8' if ( csplit.length > 1 && csplit[ 1 ].includes( 'charset=' )) { this.encoding = csplit[ 1 ].split( '=' )[ 1 ] } } if ( this.mimeType == 'application/x-www-form-urlencoded' ) { try { const mimeType = await mimeFromData( data ) this.mimeType = mimeType return data } catch ( err ) { } } if ( Buffer.isBuffer( data ) && this.encoding != null ) { data = iconv.decode( data, this.encoding ) } return data } basePath () { const purl = urlconv.parse( this.url ) const pathp = osPath.parse( purl.pathname ) return sanitizeFN( decodeURIComponent( pathp.base )) } localPath () { return this.zimNameSpace + '/' + this.basePath() } pathToTop () { return '../'.repeat( this.basePath().split( '/' ).length - 1 ) } urlKey () { return this.zimNameSpace + this.basePath() } titleKey () { return this.title ? this.zimNameSpace + this.title : this.urlKey() } mimeId () { if ( this.mimeType == null ) fatal( 'this.mimeType == null', this ) let id = mimeIds.indexOf( this.mimeType ) if ( id == -1 ) { id = mimeIds.length mimeIds.push( this.mimeType ) } return id } storeData ( data ) { if ( data == null ) return const savePath = osPath.join( wiki.saveDir, this.localPath()) log( '+', savePath ) return fs.outputFile( savePath, data ) } async storeMetadata ( ) { const row = [ this.urlKey(), this.titleKey(), this.revision, this.mimeId(), ] try { const res = await wiki.db.run( 'INSERT INTO articles ( urlKey, titleKey, revision, mimeId ) VALUES ( ?,?,?,? 
)', row ) //~ log( 'storeMetadata res', row, res ) this.id = res.stmt.lastID ++ articleCount return this.id } catch ( err ) { if ( err.code == "SQLITE_CONSTRAINT" ) return null fatal( 'storeMetadata error', err ) } } async save () { if ( this.blackListed() ) return '' try { const data = await this.getData() await this.storeData( data ) await this.storeMetadata() return this.localPath() } catch ( err ) { warning( 'Save error', err.name, this.url, '->', this.localPath()) return '' } } } // { // "pageid": 10, // "ns": 0, // "title": "Baltic Sea", // "touched": "2017-06-27T14:37:49Z", // "lastrevid": 168879, // "counter": 62340, // "length": 9324, // "fullurl": "http:\/\/www.cruiserswiki.org\/wiki\/Baltic_Sea", // "editurl": "http:\/\/www.cruiserswiki.org\/index.php?title=Baltic_Sea&action=edit" // } // { // "ns": 0, // "title": "Anchorages of Lesvos Island", // "missing": "", // "fullurl": "http:\/\/www.cruiserswiki.org\/wiki\/Anchorages_of_Lesvos_Island", // "editurl": "http:\/\/www.cruiserswiki.org\/index.php?title=Anchorages_of_Lesvos_Island&action=edit" // } class ArticleStub extends WikiItem { constructor ( pageInfo ) { super( 'A', urlconv.resolve( wiki.articleUriPrefix, pageInfo.fullurl ), pageInfo.title ) this.info = pageInfo this.mwId = pageInfo.pageid this.revision = pageInfo.lastrevid } getTitle () { if ( this.title ) return this.title if ( this.url && this.url.startsWith( wiki.articleUriPrefix )) { const urlParsed = urlconv.parse( this.url ) const subPath = urlParsed.pathname.replace( wiki.articlePath, '' ).replace( /_/g, ' ' ) return decodeURIComponent( subPath ) } return null // not a local article } basePath () { if ( this.url && this.url.startsWith( wiki.articleUriPrefix )) { const urlParsed = urlconv.parse( this.url ) const subPath = urlParsed.pathname.replace( wiki.articlePath, '' ) return sanitizeFN( decodeURIComponent( subPath )) + '.html' } return null // not a local article } } class Article extends ArticleStub { constructor ( pageInfo ) { super( 
pageInfo ) } async preProcess( data ) { let src try { src = cheerio.load( data ) } catch ( e ) { log( 'cheerio.load error', e, data ) return data } try { let content = [] if ( command.content ) { content = src( command.content ) } else { content = src( '#bodyContent' ) if ( content.length == 0 ) { content = src( 'article' ) //wikia } } if ( content.length == 0 ) { fatal( "Article.preProcess -- fatal error: Can't find article's content:", this.title ) } const dom = cheerio.load( wiki.pageTemplate ) dom( 'title' ).text( this.title ) dom( '#bodyContent' ).replaceWith( content ) // display content inside