wikizimmer.js: async/await

v committed 2018-11-12 20:19:04 +03:00
parent 60c584ead6
commit 7efe44d042
3 changed files with 172 additions and 183 deletions

README.md

@@ -11,7 +11,7 @@ The major point is that `wikizimmer.js` unlike to [mwoffliner](https://github.co
 The package is relatively easy to install and it can even process some wikis running rather old versions of the Mediawiki engine.
 ## Installation
-Requirement: `node` version >= 6.x.
+Requirement: `node` version >= 8.x.
 ### With npm globally

package.json

@@ -42,6 +42,9 @@
     "sqlite3": "*",
     "uuid": "*"
   },
+  "engines" : {
+    "node" : ">=8.0.0"
+  },
  "bin": {
     "zimmer": "./zimmer.js",
     "wikizimmer": "./wikizimmer.js"

wikizimmer.js

@@ -796,75 +796,71 @@ class GlobalCss extends StyleItem {
         .then( chunks => chunks.join( '\n' ))
     }
-    transformCss( cssUrl ) {
-        return Promise.coroutine( function* () {
-            let css = new StyleItem( cssUrl )
-            const src = yield css.data()
+    async transformCss( cssUrl ) {
+        let css = new StyleItem( cssUrl )
+        const src = await css.data()
         // collect urls using dummy replacements
         const urlre = /(url\(['"]?)([^\)]*[^\)'"])(['"]?\))/g
         const requests = []
         src.replace( urlre, ( match, start, url, end ) => {
             if ( ! url.startsWith( 'data:' )) {
                 const cssItem = new StyleItem( urlconv.resolve( cssUrl, url ))
                 requests.push( cssItem.process() )
             }
             return match
         })
-            const resolvedUrls = yield Promise.all( requests )
+        const resolvedUrls = await Promise.all( requests )
         const transformed = src.replace( urlre, ( match, start, url, end ) => {
             const rurl = resolvedUrls.shift()
             if ( rurl == null )
                 return match
             return start + rurl.slice( 3 ) + end
         })
         const outcss = `/*
  *
  * from ${cssUrl}
  *
  */
 ${transformed}
 `
         return outcss
-        }) ()
     }
 }
-function processSamplePage ( samplePageUrl, rmdir) {
-    return Promise.coroutine( function* () {
-        const resp = yield requestPromise({
+async function processSamplePage ( samplePageUrl, rmdir) {
+    const resp = await requestPromise({
         url: encodeurl( samplePageUrl ),
         resolveWithFullResponse: true,
     })
     //~log(resp)
     // set base for further http requests
     const realUrl = resp.request.href
     http = pooledRequest( requestPromise, realUrl )
     // create download directory
     const urlp = urlconv.parse( realUrl )
     wiki.saveDir = sanitizeFN( urlp.hostname )
     if ( rmdir )
-        yield fs.remove( wiki.saveDir )
-    yield fs.mkdirs( wiki.saveDir )
+        await fs.remove( wiki.saveDir )
+    await fs.mkdirs( wiki.saveDir )
     const dom = cheerio.load( resp.body )
     const historyLink = dom('#ca-history a').attr('href')
     //~log(resp.request.href, historyLink, urlconv.resolve(resp.request.href, historyLink))
     const parsedUrl = urlconv.parse(urlconv.resolve(resp.request.href, historyLink))
     log(parsedUrl)
     parsedUrl.search = null
     parsedUrl.hash = null
     const indexPhp = urlconv.format(parsedUrl)
     parsedUrl.pathname = parsedUrl.pathname.replace('index.php', 'api.php')
     wiki.apiUrl = urlconv.format(parsedUrl)
     log(indexPhp, wiki.apiUrl)
     return dom
-    })()
 }
 function loadTemplate () {
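The pattern of the whole commit in miniature: each `Promise.coroutine( function* () { ... })()` wrapper (Bluebird's coroutine runner, which the old code used to emulate async functions) is unwrapped into a native `async` function, with every `yield` becoming an `await`. A sketch of the before/after shape, using a hypothetical `fetchData()` helper:

```js
const Promise = require( 'bluebird' )               // provides Promise.coroutine
const fetchData = () => Promise.resolve( 42 )       // hypothetical async helper

// Before: generator driven by Bluebird's coroutine runner, invoked immediately.
function oldStyle () {
    return Promise.coroutine( function* () {
        const data = yield fetchData()
        return data + 1
    })()
}

// After: native async/await -- same semantics, one wrapper and one
// indentation level less, no library dependency for the control flow.
async function newStyle () {
    const data = await fetchData()
    return data + 1
}

oldStyle().then( v => console.log( v ))   // 43
newStyle().then( v => console.log( v ))   // 43
```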
@@ -873,27 +869,25 @@ function loadTemplate () {
         .then( stub => (wiki.pageTemplate = stub))
 }
-function getSiteInfo () {
-    return Promise.coroutine(function* () {
-        const resp = yield api ({
+async function getSiteInfo () {
+    const resp = await api ({
         action: 'query',
         meta: 'siteinfo',
         siprop: 'general|namespaces|namespacealiases',
     })
     const info = resp.query
     log( 'SiteInfo', info )
     wiki.info = info
     wiki.indexUrl = info.general.script
     wiki.mainPage = info.general.mainpage
     wiki.articlePath = info.general.articlepath.split('$')[0]
     wiki.articleBase = info.general.base.split( wiki.articlePath )[0] + wiki.articlePath
     wiki.baseParsed = urlconv.parse( wiki.articleBase )
     wiki.nameSpaces = new NameSpaceSet( info )
-    }) ()
 }
-function saveMetadata () {
+async function saveMetadata () {
     // Name yes A human readable identifier for the resource. It's the same across versions (should be stable across time). MUST be prefixed by the packager name. kiwix.wikipedia_en.nopics
     // Title yes title of zim file English Wikipedia
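For orientation, the siteinfo request that `getSiteInfo` awaits in the hunk above corresponds to a standard MediaWiki API query. A self-contained sketch of its wire format, assuming the project's `api()` helper adds `format=json` and an illustrative wiki at example.org:

```js
// Sketch of the siteinfo query URL; the host and /w/ path are illustrative.
const querystring = require( 'querystring' )
const params = {
    action: 'query',
    meta: 'siteinfo',
    siprop: 'general|namespaces|namespacealiases',
    format: 'json',
}
const url = 'https://example.org/w/api.php?' + querystring.stringify( params )
console.log( url )
// https://example.org/w/api.php?action=query&meta=siteinfo&siprop=general%7Cnamespaces%7Cnamespacealiases&format=json
```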
@@ -929,25 +923,21 @@ function saveMetadata () {
         Source: urlconv.resolve( wiki.articleBase, wiki.info.general.server ),
     }
-    return Promise.coroutine( function * () {
-        yield new MainPage().process()
-        yield new FavIcon().process()
+    await new MainPage().process()
+    await new FavIcon().process()
     for ( let i in metadata ) {
-        yield new Metadata( i, metadata[i] ).process()
+        await new Metadata( i, metadata[i] ).process()
     }
-    }) ()
 }
-function saveMimeTypes () {
-    return Promise.coroutine( function * () {
-        for ( let i=0, li=mimeIds.length; i < li; i++ ) {
-            yield indexerDb.run(
+async function saveMimeTypes () {
+    for ( let i=0, li=mimeIds.length; i < li; i++ ) {
+        await indexerDb.run(
             'INSERT INTO mimeTypes (id, value) VALUES (?,?)',
             [ i + 1, mimeIds[ i ]]
         )
     }
-    }) ()
 }
 function batchRedirects ( pageInfos ) {
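A detail worth keeping in mind about loops like the one in `saveMimeTypes`: `await` inside a `for` loop serializes the INSERTs, each statement completing before the next starts, which is the ordering you want for statements that belong to one SQLite transaction. A contrast sketch with a stubbed `db` standing in for the sqlite3 wrapper:

```js
// Stub: resolves after a short delay, like an async db driver would.
const db = {
    run: ( sql, params ) => new Promise( resolve => setTimeout( () => resolve( params ), 10 )),
}

async function insertSequentially ( rows ) {
    for ( const row of rows )
        await db.run( 'INSERT INTO t (v) VALUES (?)', [ row ] )   // one at a time
}

async function insertConcurrently ( rows ) {
    // All statements in flight at once: fine for independent work,
    // risky for statements that must stay ordered within a transaction.
    await Promise.all( rows.map( row => db.run( 'INSERT INTO t (v) VALUES (?)', [ row ] )))
}

insertSequentially([ 1, 2, 3 ]).then( () => insertConcurrently([ 4, 5, 6 ]))
```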
@@ -998,107 +988,103 @@ function batchRedirects ( pageInfos ) {
     })
 }
-function batchPages ( nameSpace ) {
+async function batchPages ( nameSpace ) {
     const queryPageLimit = 500
     const queryMaxTitles = 50
-    return Promise.coroutine( function* () {
     const exclude = command.exclude ?
         new RegExp( command.exclude ) :
         { test: () => false }
     const query = {
         action: 'query',
         prop: 'info',
         inprop: 'url',
     }
     Object.assign(
         query,
         nameSpace == null ?
             { titles: command.titles } :
             {
                 generator: 'allpages',
                 gapnamespace: nameSpace,
                 gaplimit: queryPageLimit,
                 rawcontinue: '',
             }
     )
     let continueFrom = ''
     while ( true ) {
-        yield indexerDb.run(
+        await indexerDb.run(
             'INSERT OR REPLACE INTO continue (id, "from") VALUES (1, ?)',
             [ continueFrom ]
         )
         if ( continueFrom == null )
             break
-        yield indexerDb.run( 'BEGIN' )
+        await indexerDb.run( 'BEGIN' )
-        const resp = yield api( query )
+        const resp = await api( query )
         let pages = {}
         try {
             pages = resp.query.pages
             //~ log( '*pages', pages )
         }
         catch (e) {
             log( 'getPages', 'NO PAGES' )
         }
         let redirects = []
         const done = Object.keys( pages ).map( key => {
             if ( parseInt( key ) < 0 ) { // no such page
                 return null
             }
             const pageInfo = pages[ key ]
             if ( pageInfo.redirect != null ) {
                 log( '>' , pageInfo.title )
                 redirects.push( pageInfo )
                 if ( redirects.length == queryMaxTitles ) {
                     const res = batchRedirects( redirects )
                     redirects = []
                     return res
                 }
                 return null
             }
             if ( ! command.pages || exclude.test( pageInfo.title )) {
                 log( 'x', pageInfo.title )
                 return null
             }
             log( '#', pageInfo.title )
             return new Article( pageInfo ).process()
         })
         done.push( batchRedirects( redirects ))
-        yield Promise.all( done )
-        yield indexerDb.run( 'COMMIT' )
+        await Promise.all( done )
+        await indexerDb.run( 'COMMIT' )
         continueFrom = null
         try {
             const continueKey = Object.keys( resp[ 'query-continue' ].allpages )[ 0 ]
             continueFrom = resp[ 'query-continue' ].allpages[ continueKey ]
             query[ continueKey ] = continueFrom
             log( '...', continueFrom )
         }
         catch ( e ) {
             log( 'getPages', 'No continue key' )
         }
     }
-    })()
 }
-function getPages () {
-    return Promise.coroutine( function* () {
-        if ( command.titles ) {
-            log( 'Titles', command.titles )
-            yield batchPages()
-        } else {
-            wiki.nameSpaces.init( command.nameSpaces )
-            for ( let nameSpace of wiki.nameSpaces ) {
-                log( 'Name Space', nameSpace )
-                yield batchPages( nameSpace )
-            }
-        }
-        log( '**************** done' )
-    })()
+async function getPages () {
+    if ( command.titles ) {
+        log( 'Titles', command.titles )
+        await batchPages()
+    } else {
+        wiki.nameSpaces.init( command.nameSpaces )
+        for ( let nameSpace of wiki.nameSpaces ) {
+            log( 'Name Space', nameSpace )
+            await batchPages( nameSpace )
+        }
+    }
+    log( '**************** done' )
 }
 function loadCss( dom ) {
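One behavioral consequence of the conversion: a failure inside these `async` functions now surfaces as an ordinary rejected promise at the call site, and on Node 8 an uncaught rejection only produces an `UnhandledPromiseRejectionWarning` rather than a crash. A hypothetical top-level driver (the function names are from this diff, but the sequence is illustrative; the real invocation order lives elsewhere in wikizimmer.js) would therefore want an explicit catch:

```js
// Hypothetical driver, for illustration only.
async function main () {
    await processSamplePage( 'https://en.wikipedia.org/wiki/Example', true )
    await getSiteInfo()
    await getPages()
}

main().catch( err => {
    console.error( err )
    process.exit( 1 )
})
```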