wikizimmer.js: async/await
This commit is contained in:
parent
60c584ead6
commit
7efe44d042
|
@ -11,7 +11,7 @@ The major point is that `wikizimmer.js` unlike to [mwoffliner](https://github.co
|
||||||
The package is relatively easy to install and it can even process some wikis running rather old versions of the Mediawiki engine.
|
The package is relatively easy to install and it can even process some wikis running rather old versions of the Mediawiki engine.
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
Requirement: `node` version >= 6.x.
|
Requirement: `node` version >= 8.x.
|
||||||
|
|
||||||
### With npm globally
|
### With npm globally
|
||||||
|
|
||||||
|
|
|
@ -42,6 +42,9 @@
|
||||||
"sqlite3": "*",
|
"sqlite3": "*",
|
||||||
"uuid": "*"
|
"uuid": "*"
|
||||||
},
|
},
|
||||||
|
"engines" : {
|
||||||
|
"node" : ">=8.0.0"
|
||||||
|
},
|
||||||
"bin": {
|
"bin": {
|
||||||
"zimmer": "./zimmer.js",
|
"zimmer": "./zimmer.js",
|
||||||
"wikizimmer": "./wikizimmer.js"
|
"wikizimmer": "./wikizimmer.js"
|
||||||
|
|
350
wikizimmer.js
350
wikizimmer.js
|
@ -796,75 +796,71 @@ class GlobalCss extends StyleItem {
|
||||||
.then( chunks => chunks.join( '\n' ))
|
.then( chunks => chunks.join( '\n' ))
|
||||||
}
|
}
|
||||||
|
|
||||||
transformCss( cssUrl ) {
|
async transformCss( cssUrl ) {
|
||||||
return Promise.coroutine( function* () {
|
let css = new StyleItem( cssUrl )
|
||||||
let css = new StyleItem( cssUrl )
|
const src = await css.data()
|
||||||
const src = yield css.data()
|
|
||||||
|
|
||||||
// collect urls using dummy replacements
|
// collect urls using dummy replacements
|
||||||
const urlre = /(url\(['"]?)([^\)]*[^\)'"])(['"]?\))/g
|
const urlre = /(url\(['"]?)([^\)]*[^\)'"])(['"]?\))/g
|
||||||
const requests = []
|
const requests = []
|
||||||
src.replace( urlre, ( match, start, url, end ) => {
|
src.replace( urlre, ( match, start, url, end ) => {
|
||||||
if ( ! url.startsWith( 'data:' )) {
|
if ( ! url.startsWith( 'data:' )) {
|
||||||
const cssItem = new StyleItem( urlconv.resolve( cssUrl, url ))
|
const cssItem = new StyleItem( urlconv.resolve( cssUrl, url ))
|
||||||
requests.push( cssItem.process() )
|
requests.push( cssItem.process() )
|
||||||
}
|
}
|
||||||
|
return match
|
||||||
|
})
|
||||||
|
const resolvedUrls = await Promise.all( requests )
|
||||||
|
const transformed = src.replace( urlre, ( match, start, url, end ) => {
|
||||||
|
const rurl = resolvedUrls.shift()
|
||||||
|
if ( rurl == null )
|
||||||
return match
|
return match
|
||||||
})
|
return start + rurl.slice( 3 ) + end
|
||||||
const resolvedUrls = yield Promise.all( requests )
|
})
|
||||||
const transformed = src.replace( urlre, ( match, start, url, end ) => {
|
|
||||||
const rurl = resolvedUrls.shift()
|
|
||||||
if ( rurl == null )
|
|
||||||
return match
|
|
||||||
return start + rurl.slice( 3 ) + end
|
|
||||||
})
|
|
||||||
|
|
||||||
const outcss = `/*
|
const outcss = `/*
|
||||||
*
|
*
|
||||||
* from ${cssUrl}
|
* from ${cssUrl}
|
||||||
*
|
*
|
||||||
*/
|
*/
|
||||||
${transformed}
|
${transformed}
|
||||||
`
|
`
|
||||||
return outcss
|
return outcss
|
||||||
}) ()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function processSamplePage ( samplePageUrl, rmdir) {
|
async function processSamplePage ( samplePageUrl, rmdir) {
|
||||||
return Promise.coroutine( function* () {
|
const resp = await requestPromise({
|
||||||
const resp = yield requestPromise({
|
url: encodeurl( samplePageUrl ),
|
||||||
url: encodeurl( samplePageUrl ),
|
resolveWithFullResponse: true,
|
||||||
resolveWithFullResponse: true,
|
})
|
||||||
})
|
//~log(resp)
|
||||||
//~log(resp)
|
|
||||||
|
|
||||||
// set base for further http requests
|
// set base for further http requests
|
||||||
const realUrl = resp.request.href
|
const realUrl = resp.request.href
|
||||||
http = pooledRequest( requestPromise, realUrl )
|
http = pooledRequest( requestPromise, realUrl )
|
||||||
|
|
||||||
// create download directory
|
// create download directory
|
||||||
const urlp = urlconv.parse( realUrl )
|
const urlp = urlconv.parse( realUrl )
|
||||||
wiki.saveDir = sanitizeFN( urlp.hostname )
|
wiki.saveDir = sanitizeFN( urlp.hostname )
|
||||||
if ( rmdir )
|
if ( rmdir )
|
||||||
yield fs.remove( wiki.saveDir )
|
await fs.remove( wiki.saveDir )
|
||||||
yield fs.mkdirs( wiki.saveDir )
|
await fs.mkdirs( wiki.saveDir )
|
||||||
|
|
||||||
const dom = cheerio.load( resp.body )
|
const dom = cheerio.load( resp.body )
|
||||||
const historyLink = dom('#ca-history a').attr('href')
|
const historyLink = dom('#ca-history a').attr('href')
|
||||||
//~log(resp.request.href, historyLink, urlconv.resolve(resp.request.href, historyLink))
|
//~log(resp.request.href, historyLink, urlconv.resolve(resp.request.href, historyLink))
|
||||||
const parsedUrl = urlconv.parse(urlconv.resolve(resp.request.href, historyLink))
|
const parsedUrl = urlconv.parse(urlconv.resolve(resp.request.href, historyLink))
|
||||||
log(parsedUrl)
|
log(parsedUrl)
|
||||||
parsedUrl.search = null
|
parsedUrl.search = null
|
||||||
parsedUrl.hash = null
|
parsedUrl.hash = null
|
||||||
const indexPhp = urlconv.format(parsedUrl)
|
const indexPhp = urlconv.format(parsedUrl)
|
||||||
parsedUrl.pathname = parsedUrl.pathname.replace('index.php', 'api.php')
|
parsedUrl.pathname = parsedUrl.pathname.replace('index.php', 'api.php')
|
||||||
|
|
||||||
wiki.apiUrl = urlconv.format(parsedUrl)
|
wiki.apiUrl = urlconv.format(parsedUrl)
|
||||||
log(indexPhp, wiki.apiUrl)
|
log(indexPhp, wiki.apiUrl)
|
||||||
|
|
||||||
return dom
|
return dom
|
||||||
})()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function loadTemplate () {
|
function loadTemplate () {
|
||||||
|
@ -873,27 +869,25 @@ function loadTemplate () {
|
||||||
.then( stub => (wiki.pageTemplate = stub))
|
.then( stub => (wiki.pageTemplate = stub))
|
||||||
}
|
}
|
||||||
|
|
||||||
function getSiteInfo () {
|
async function getSiteInfo () {
|
||||||
return Promise.coroutine(function* () {
|
const resp = await api ({
|
||||||
const resp = yield api ({
|
action: 'query',
|
||||||
action: 'query',
|
meta: 'siteinfo',
|
||||||
meta: 'siteinfo',
|
siprop: 'general|namespaces|namespacealiases',
|
||||||
siprop: 'general|namespaces|namespacealiases',
|
})
|
||||||
})
|
|
||||||
|
|
||||||
const info = resp.query
|
const info = resp.query
|
||||||
log( 'SiteInfo', info )
|
log( 'SiteInfo', info )
|
||||||
wiki.info = info
|
wiki.info = info
|
||||||
wiki.indexUrl = info.general.script
|
wiki.indexUrl = info.general.script
|
||||||
wiki.mainPage = info.general.mainpage
|
wiki.mainPage = info.general.mainpage
|
||||||
wiki.articlePath = info.general.articlepath.split('$')[0]
|
wiki.articlePath = info.general.articlepath.split('$')[0]
|
||||||
wiki.articleBase = info.general.base.split( wiki.articlePath )[0] + wiki.articlePath
|
wiki.articleBase = info.general.base.split( wiki.articlePath )[0] + wiki.articlePath
|
||||||
wiki.baseParsed = urlconv.parse( wiki.articleBase )
|
wiki.baseParsed = urlconv.parse( wiki.articleBase )
|
||||||
wiki.nameSpaces = new NameSpaceSet( info )
|
wiki.nameSpaces = new NameSpaceSet( info )
|
||||||
}) ()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function saveMetadata () {
|
async function saveMetadata () {
|
||||||
|
|
||||||
// Name yes A human readable identifier for the resource. It's the same across versions (should be stable across time). MUST be prefixed by the packager name. kiwix.wikipedia_en.nopics
|
// Name yes A human readable identifier for the resource. It's the same across versions (should be stable across time). MUST be prefixed by the packager name. kiwix.wikipedia_en.nopics
|
||||||
// Title yes title of zim file English Wikipedia
|
// Title yes title of zim file English Wikipedia
|
||||||
|
@ -929,25 +923,21 @@ function saveMetadata () {
|
||||||
Source: urlconv.resolve( wiki.articleBase, wiki.info.general.server ),
|
Source: urlconv.resolve( wiki.articleBase, wiki.info.general.server ),
|
||||||
}
|
}
|
||||||
|
|
||||||
return Promise.coroutine( function * () {
|
await new MainPage().process()
|
||||||
yield new MainPage().process()
|
await new FavIcon().process()
|
||||||
yield new FavIcon().process()
|
|
||||||
|
|
||||||
for ( let i in metadata ) {
|
for ( let i in metadata ) {
|
||||||
yield new Metadata( i, metadata[i] ).process()
|
await new Metadata( i, metadata[i] ).process()
|
||||||
}
|
}
|
||||||
}) ()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function saveMimeTypes () {
|
async function saveMimeTypes () {
|
||||||
return Promise.coroutine( function * () {
|
for ( let i=0, li=mimeIds.length; i < li; i++ ) {
|
||||||
for ( let i=0, li=mimeIds.length; i < li; i++ ) {
|
await indexerDb.run(
|
||||||
yield indexerDb.run(
|
'INSERT INTO mimeTypes (id, value) VALUES (?,?)',
|
||||||
'INSERT INTO mimeTypes (id, value) VALUES (?,?)',
|
[ i + 1, mimeIds[ i ]]
|
||||||
[ i + 1, mimeIds[ i ]]
|
)
|
||||||
)
|
}
|
||||||
}
|
|
||||||
}) ()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function batchRedirects ( pageInfos ) {
|
function batchRedirects ( pageInfos ) {
|
||||||
|
@ -998,107 +988,103 @@ function batchRedirects ( pageInfos ) {
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
function batchPages ( nameSpace ) {
|
async function batchPages ( nameSpace ) {
|
||||||
const queryPageLimit = 500
|
const queryPageLimit = 500
|
||||||
const queryMaxTitles = 50
|
const queryMaxTitles = 50
|
||||||
|
|
||||||
return Promise.coroutine( function* () {
|
const exclude = command.exclude ?
|
||||||
const exclude = command.exclude ?
|
new RegExp( command.exclude ) :
|
||||||
new RegExp( command.exclude ) :
|
{ test: () => false }
|
||||||
{ test: () => false }
|
const query = {
|
||||||
const query = {
|
action: 'query',
|
||||||
action: 'query',
|
prop: 'info',
|
||||||
prop: 'info',
|
inprop: 'url',
|
||||||
inprop: 'url',
|
}
|
||||||
|
Object.assign(
|
||||||
|
query,
|
||||||
|
nameSpace == null ?
|
||||||
|
{ titles: command.titles } :
|
||||||
|
{
|
||||||
|
generator: 'allpages',
|
||||||
|
gapnamespace: nameSpace,
|
||||||
|
gaplimit: queryPageLimit,
|
||||||
|
rawcontinue: '',
|
||||||
}
|
}
|
||||||
Object.assign(
|
)
|
||||||
query,
|
|
||||||
nameSpace == null ?
|
let continueFrom = ''
|
||||||
{ titles: command.titles } :
|
while ( true ) {
|
||||||
{
|
await indexerDb.run(
|
||||||
generator: 'allpages',
|
'INSERT OR REPLACE INTO continue (id, "from") VALUES (1, ?)',
|
||||||
gapnamespace: nameSpace,
|
[ continueFrom ]
|
||||||
gaplimit: queryPageLimit,
|
|
||||||
rawcontinue: '',
|
|
||||||
}
|
|
||||||
)
|
)
|
||||||
|
if ( continueFrom == null )
|
||||||
|
break
|
||||||
|
|
||||||
let continueFrom = ''
|
await indexerDb.run( 'BEGIN' )
|
||||||
while ( true ) {
|
|
||||||
yield indexerDb.run(
|
|
||||||
'INSERT OR REPLACE INTO continue (id, "from") VALUES (1, ?)',
|
|
||||||
[ continueFrom ]
|
|
||||||
)
|
|
||||||
if ( continueFrom == null )
|
|
||||||
break
|
|
||||||
|
|
||||||
yield indexerDb.run( 'BEGIN' )
|
const resp = await api( query )
|
||||||
|
let pages = {}
|
||||||
const resp = yield api( query )
|
try {
|
||||||
let pages = {}
|
pages = resp.query.pages
|
||||||
try {
|
//~ log( '*pages', pages )
|
||||||
pages = resp.query.pages
|
|
||||||
//~ log( '*pages', pages )
|
|
||||||
}
|
|
||||||
catch (e) {
|
|
||||||
log( 'getPages', 'NO PAGES' )
|
|
||||||
}
|
|
||||||
let redirects = []
|
|
||||||
const done = Object.keys( pages ).map( key => {
|
|
||||||
if ( parseInt( key ) < 0 ) { // no such page
|
|
||||||
return null
|
|
||||||
}
|
|
||||||
const pageInfo = pages[ key ]
|
|
||||||
if ( pageInfo.redirect != null ) {
|
|
||||||
log( '>' , pageInfo.title )
|
|
||||||
redirects.push( pageInfo )
|
|
||||||
if ( redirects.length == queryMaxTitles ) {
|
|
||||||
const res = batchRedirects( redirects )
|
|
||||||
redirects = []
|
|
||||||
return res
|
|
||||||
}
|
|
||||||
return null
|
|
||||||
}
|
|
||||||
if ( ! command.pages || exclude.test( pageInfo.title )) {
|
|
||||||
log( 'x', pageInfo.title )
|
|
||||||
return null
|
|
||||||
}
|
|
||||||
log( '#', pageInfo.title )
|
|
||||||
return new Article( pageInfo ).process()
|
|
||||||
})
|
|
||||||
done.push( batchRedirects( redirects ))
|
|
||||||
yield Promise.all( done )
|
|
||||||
|
|
||||||
yield indexerDb.run( 'COMMIT' )
|
|
||||||
|
|
||||||
continueFrom = null
|
|
||||||
try {
|
|
||||||
const continueKey = Object.keys( resp[ 'query-continue' ].allpages )[ 0 ]
|
|
||||||
continueFrom = resp[ 'query-continue' ].allpages[ continueKey ]
|
|
||||||
query[ continueKey ] = continueFrom
|
|
||||||
log( '...', continueFrom )
|
|
||||||
}
|
|
||||||
catch ( e ) {
|
|
||||||
log( 'getPages', 'No continue key' )
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
})()
|
catch (e) {
|
||||||
|
log( 'getPages', 'NO PAGES' )
|
||||||
|
}
|
||||||
|
let redirects = []
|
||||||
|
const done = Object.keys( pages ).map( key => {
|
||||||
|
if ( parseInt( key ) < 0 ) { // no such page
|
||||||
|
return null
|
||||||
|
}
|
||||||
|
const pageInfo = pages[ key ]
|
||||||
|
if ( pageInfo.redirect != null ) {
|
||||||
|
log( '>' , pageInfo.title )
|
||||||
|
redirects.push( pageInfo )
|
||||||
|
if ( redirects.length == queryMaxTitles ) {
|
||||||
|
const res = batchRedirects( redirects )
|
||||||
|
redirects = []
|
||||||
|
return res
|
||||||
|
}
|
||||||
|
return null
|
||||||
|
}
|
||||||
|
if ( ! command.pages || exclude.test( pageInfo.title )) {
|
||||||
|
log( 'x', pageInfo.title )
|
||||||
|
return null
|
||||||
|
}
|
||||||
|
log( '#', pageInfo.title )
|
||||||
|
return new Article( pageInfo ).process()
|
||||||
|
})
|
||||||
|
done.push( batchRedirects( redirects ))
|
||||||
|
await Promise.all( done )
|
||||||
|
|
||||||
|
await indexerDb.run( 'COMMIT' )
|
||||||
|
|
||||||
|
continueFrom = null
|
||||||
|
try {
|
||||||
|
const continueKey = Object.keys( resp[ 'query-continue' ].allpages )[ 0 ]
|
||||||
|
continueFrom = resp[ 'query-continue' ].allpages[ continueKey ]
|
||||||
|
query[ continueKey ] = continueFrom
|
||||||
|
log( '...', continueFrom )
|
||||||
|
}
|
||||||
|
catch ( e ) {
|
||||||
|
log( 'getPages', 'No continue key' )
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
function getPages () {
|
async function getPages () {
|
||||||
return Promise.coroutine( function* () {
|
if ( command.titles ) {
|
||||||
if ( command.titles ) {
|
log( 'Titles', command.titles )
|
||||||
log( 'Titles', command.titles )
|
await batchPages()
|
||||||
yield batchPages()
|
} else {
|
||||||
} else {
|
wiki.nameSpaces.init( command.nameSpaces )
|
||||||
wiki.nameSpaces.init( command.nameSpaces )
|
for ( let nameSpace of wiki.nameSpaces ) {
|
||||||
for ( let nameSpace of wiki.nameSpaces ) {
|
log( 'Name Space', nameSpace )
|
||||||
log( 'Name Space', nameSpace )
|
await batchPages( nameSpace )
|
||||||
yield batchPages( nameSpace )
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
log( '**************** done' )
|
}
|
||||||
})()
|
log( '**************** done' )
|
||||||
}
|
}
|
||||||
|
|
||||||
function loadCss( dom ) {
|
function loadCss( dom ) {
|
||||||
|
|
Loading…
Reference in New Issue
Block a user