wikizimmer.js: async/await

v committed 2018-11-12 20:19:04 +03:00
parent 60c584ead6
commit 7efe44d042
3 changed files with 172 additions and 183 deletions

README.md

@@ -11,7 +11,7 @@ The major point is that `wikizimmer.js` unlike to [mwoffliner](https://github.co
 The package is relatively easy to install and it can even process some wikis running rather old versions of the Mediawiki engine.
 ## Installation
-Requirement: `node` version >= 6.x.
+Requirement: `node` version >= 8.x.
 ### With npm globally

package.json

@@ -42,6 +42,9 @@
     "sqlite3": "*",
     "uuid": "*"
   },
+  "engines" : {
+    "node" : ">=8.0.0"
+  },
  "bin": {
     "zimmer": "./zimmer.js",
     "wikizimmer": "./wikizimmer.js"

wikizimmer.js

@@ -796,75 +796,71 @@ class GlobalCss extends StyleItem {
         .then( chunks => chunks.join( '\n' ))
     }
-    transformCss( cssUrl ) {
-        return Promise.coroutine( function* () {
-            let css = new StyleItem( cssUrl )
-            const src = yield css.data()
+    async transformCss( cssUrl ) {
+        let css = new StyleItem( cssUrl )
+        const src = await css.data()
         // collect urls using dummy replacements
         const urlre = /(url\(['"]?)([^\)]*[^\)'"])(['"]?\))/g
         const requests = []
         src.replace( urlre, ( match, start, url, end ) => {
             if ( ! url.startsWith( 'data:' )) {
                 const cssItem = new StyleItem( urlconv.resolve( cssUrl, url ))
                 requests.push( cssItem.process() )
             }
             return match
         })
-            const resolvedUrls = yield Promise.all( requests )
+        const resolvedUrls = await Promise.all( requests )
         const transformed = src.replace( urlre, ( match, start, url, end ) => {
             const rurl = resolvedUrls.shift()
             if ( rurl == null )
                 return match
             return start + rurl.slice( 3 ) + end
         })
         const outcss = `/*
  *
  * from ${cssUrl}
  *
  */
 ${transformed}
 `
         return outcss
-        }) ()
     }
 }
-function processSamplePage ( samplePageUrl, rmdir) {
-    return Promise.coroutine( function* () {
-        const resp = yield requestPromise({
+async function processSamplePage ( samplePageUrl, rmdir) {
+    const resp = await requestPromise({
         url: encodeurl( samplePageUrl ),
         resolveWithFullResponse: true,
     })
     //~log(resp)
     // set base for further http requests
     const realUrl = resp.request.href
     http = pooledRequest( requestPromise, realUrl )
     // create download directory
     const urlp = urlconv.parse( realUrl )
     wiki.saveDir = sanitizeFN( urlp.hostname )
     if ( rmdir )
-        yield fs.remove( wiki.saveDir )
-    yield fs.mkdirs( wiki.saveDir )
+        await fs.remove( wiki.saveDir )
+    await fs.mkdirs( wiki.saveDir )
     const dom = cheerio.load( resp.body )
     const historyLink = dom('#ca-history a').attr('href')
     //~log(resp.request.href, historyLink, urlconv.resolve(resp.request.href, historyLink))
     const parsedUrl = urlconv.parse(urlconv.resolve(resp.request.href, historyLink))
     log(parsedUrl)
     parsedUrl.search = null
     parsedUrl.hash = null
     const indexPhp = urlconv.format(parsedUrl)
     parsedUrl.pathname = parsedUrl.pathname.replace('index.php', 'api.php')
     wiki.apiUrl = urlconv.format(parsedUrl)
     log(indexPhp, wiki.apiUrl)
     return dom
-    })()
 }
 function loadTemplate () {
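The pattern of the whole commit in miniature: each `Promise.coroutine( function* () { ... })()` wrapper (Bluebird's coroutine runner, which the old code used to emulate async functions) is unwrapped into a native `async` function, with every `yield` becoming an `await`. A sketch of the before/after shape, using a hypothetical `fetchData()` helper:

```js
const Promise = require( 'bluebird' )               // provides Promise.coroutine
const fetchData = () => Promise.resolve( 42 )       // hypothetical async helper

// Before: generator driven by Bluebird's coroutine runner, invoked immediately.
function oldStyle () {
    return Promise.coroutine( function* () {
        const data = yield fetchData()
        return data + 1
    })()
}

// After: native async/await -- same semantics, one wrapper and one
// indentation level less, no library dependency for the control flow.
async function newStyle () {
    const data = await fetchData()
    return data + 1
}

oldStyle().then( v => console.log( v ))   // 43
newStyle().then( v => console.log( v ))   // 43
```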
@@ -873,27 +869,25 @@ function loadTemplate () {
         .then( stub => (wiki.pageTemplate = stub))
 }
-function getSiteInfo () {
-    return Promise.coroutine(function* () {
-        const resp = yield api ({
+async function getSiteInfo () {
+    const resp = await api ({
         action: 'query',
         meta: 'siteinfo',
         siprop: 'general|namespaces|namespacealiases',
     })
     const info = resp.query
     log( 'SiteInfo', info )
     wiki.info = info
     wiki.indexUrl = info.general.script
     wiki.mainPage = info.general.mainpage
     wiki.articlePath = info.general.articlepath.split('$')[0]
     wiki.articleBase = info.general.base.split( wiki.articlePath )[0] + wiki.articlePath
     wiki.baseParsed = urlconv.parse( wiki.articleBase )
     wiki.nameSpaces = new NameSpaceSet( info )
-    }) ()
 }
-function saveMetadata () {
+async function saveMetadata () {
     // Name yes A human readable identifier for the resource. It's the same across versions (should be stable across time). MUST be prefixed by the packager name. kiwix.wikipedia_en.nopics
     // Title yes title of zim file English Wikipedia
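For orientation, the siteinfo request that `getSiteInfo` awaits in the hunk above corresponds to a standard MediaWiki API query. A self-contained sketch of its wire format, assuming the project's `api()` helper adds `format=json` and an illustrative wiki at example.org:

```js
// Sketch of the siteinfo query URL; the host and /w/ path are illustrative.
const querystring = require( 'querystring' )
const params = {
    action: 'query',
    meta: 'siteinfo',
    siprop: 'general|namespaces|namespacealiases',
    format: 'json',
}
const url = 'https://example.org/w/api.php?' + querystring.stringify( params )
console.log( url )
// https://example.org/w/api.php?action=query&meta=siteinfo&siprop=general%7Cnamespaces%7Cnamespacealiases&format=json
```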
@@ -929,25 +923,21 @@ function saveMetadata () {
         Source: urlconv.resolve( wiki.articleBase, wiki.info.general.server ),
     }
-    return Promise.coroutine( function * () {
-        yield new MainPage().process()
-        yield new FavIcon().process()
+    await new MainPage().process()
+    await new FavIcon().process()
     for ( let i in metadata ) {
-        yield new Metadata( i, metadata[i] ).process()
+        await new Metadata( i, metadata[i] ).process()
     }
-    }) ()
 }
-function saveMimeTypes () {
-    return Promise.coroutine( function * () {
-        for ( let i=0, li=mimeIds.length; i < li; i++ ) {
-            yield indexerDb.run(
+async function saveMimeTypes () {
+    for ( let i=0, li=mimeIds.length; i < li; i++ ) {
+        await indexerDb.run(
             'INSERT INTO mimeTypes (id, value) VALUES (?,?)',
             [ i + 1, mimeIds[ i ]]
         )
     }
-    }) ()
 }
 function batchRedirects ( pageInfos ) {
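A detail worth keeping in mind about loops like the one in `saveMimeTypes`: `await` inside a `for` loop serializes the INSERTs, each statement completing before the next starts, which is the ordering you want for statements that belong to one SQLite transaction. A contrast sketch with a stubbed `db` standing in for the sqlite3 wrapper:

```js
// Stub: resolves after a short delay, like an async db driver would.
const db = {
    run: ( sql, params ) => new Promise( resolve => setTimeout( () => resolve( params ), 10 )),
}

async function insertSequentially ( rows ) {
    for ( const row of rows )
        await db.run( 'INSERT INTO t (v) VALUES (?)', [ row ] )   // one at a time
}

async function insertConcurrently ( rows ) {
    // All statements in flight at once: fine for independent work,
    // risky for statements that must stay ordered within a transaction.
    await Promise.all( rows.map( row => db.run( 'INSERT INTO t (v) VALUES (?)', [ row ] )))
}

insertSequentially([ 1, 2, 3 ]).then( () => insertConcurrently([ 4, 5, 6 ]))
```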
@@ -998,107 +988,103 @@ function batchRedirects ( pageInfos ) {
     })
 }
-function batchPages ( nameSpace ) {
+async function batchPages ( nameSpace ) {
     const queryPageLimit = 500
     const queryMaxTitles = 50
-    return Promise.coroutine( function* () {
     const exclude = command.exclude ?
         new RegExp( command.exclude ) :
         { test: () => false }
     const query = {
         action: 'query',
         prop: 'info',
         inprop: 'url',
     }
     Object.assign(
         query,
         nameSpace == null ?
             { titles: command.titles } :
             {
                 generator: 'allpages',
                 gapnamespace: nameSpace,
                 gaplimit: queryPageLimit,
                 rawcontinue: '',
             }
     )
     let continueFrom = ''
     while ( true ) {
-        yield indexerDb.run(
+        await indexerDb.run(
             'INSERT OR REPLACE INTO continue (id, "from") VALUES (1, ?)',
             [ continueFrom ]
         )
         if ( continueFrom == null )
             break
-        yield indexerDb.run( 'BEGIN' )
+        await indexerDb.run( 'BEGIN' )
-        const resp = yield api( query )
+        const resp = await api( query )
         let pages = {}
         try {
             pages = resp.query.pages
             //~ log( '*pages', pages )
         }
         catch (e) {
             log( 'getPages', 'NO PAGES' )
         }
         let redirects = []
         const done = Object.keys( pages ).map( key => {
             if ( parseInt( key ) < 0 ) { // no such page
                 return null
             }
             const pageInfo = pages[ key ]
             if ( pageInfo.redirect != null ) {
                 log( '>' , pageInfo.title )
                 redirects.push( pageInfo )
                 if ( redirects.length == queryMaxTitles ) {
                     const res = batchRedirects( redirects )
                     redirects = []
                     return res
                 }
                 return null
             }
             if ( ! command.pages || exclude.test( pageInfo.title )) {
                 log( 'x', pageInfo.title )
                 return null
             }
             log( '#', pageInfo.title )
             return new Article( pageInfo ).process()
         })
         done.push( batchRedirects( redirects ))
-        yield Promise.all( done )
-        yield indexerDb.run( 'COMMIT' )
+        await Promise.all( done )
+        await indexerDb.run( 'COMMIT' )
         continueFrom = null
         try {
             const continueKey = Object.keys( resp[ 'query-continue' ].allpages )[ 0 ]
             continueFrom = resp[ 'query-continue' ].allpages[ continueKey ]
             query[ continueKey ] = continueFrom
             log( '...', continueFrom )
         }
         catch ( e ) {
             log( 'getPages', 'No continue key' )
         }
     }
-    })()
 }
-function getPages () {
-    return Promise.coroutine( function* () {
-        if ( command.titles ) {
-            log( 'Titles', command.titles )
-            yield batchPages()
-        } else {
-            wiki.nameSpaces.init( command.nameSpaces )
-            for ( let nameSpace of wiki.nameSpaces ) {
-                log( 'Name Space', nameSpace )
-                yield batchPages( nameSpace )
-            }
-        }
-        log( '**************** done' )
-    })()
+async function getPages () {
+    if ( command.titles ) {
+        log( 'Titles', command.titles )
+        await batchPages()
+    } else {
+        wiki.nameSpaces.init( command.nameSpaces )
+        for ( let nameSpace of wiki.nameSpaces ) {
+            log( 'Name Space', nameSpace )
+            await batchPages( nameSpace )
+        }
+    }
+    log( '**************** done' )
 }
 function loadCss( dom ) {
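One behavioral consequence of the conversion: a failure inside these `async` functions now surfaces as an ordinary rejected promise at the call site, and on Node 8 an uncaught rejection only produces an `UnhandledPromiseRejectionWarning` rather than a crash. A hypothetical top-level driver (the function names are from this diff, but the sequence is illustrative; the real invocation order lives elsewhere in wikizimmer.js) would therefore want an explicit catch:

```js
// Hypothetical driver, for illustration only.
async function main () {
    await processSamplePage( 'https://en.wikipedia.org/wiki/Example', true )
    await getSiteInfo()
    await getPages()
}

main().catch( err => {
    console.error( err )
    process.exit( 1 )
})
```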