From fd2052e63c289d08a4e018169460c89cace90710 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Sat, 24 Jun 2006 17:33:35 +0000 Subject: [PATCH] Search results scraping for PubMed and Google Books. This marks the end of what I can do with respect to #15 until I'm at home or CHNM, where I'll have access to the gated collections. --- scrapers.sql | 96 +++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 72 insertions(+), 24 deletions(-) diff --git a/scrapers.sql b/scrapers.sql index 7a4614075..3bef2614e 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,7 +1,7 @@ --- 15 +-- 16 -- Set the following timestamp to the most recent scraper update date -REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-24 11:22:00')); +REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-24 13:31:00')); REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; @@ -1747,23 +1747,52 @@ for(i in elmts) { model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);'); -REPLACE INTO "scrapers" VALUES('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-18 11:19:00', 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +REPLACE INTO "scrapers" VALUES('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-24 13:17:00', 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; -function mapRDF(text, rdfUri) { +function mapRDF(uri, text, rdfUri) { if(text != "") { model.addStatement(uri, rdfUri, text, true); } } var uri = doc.location.href; -var newUri = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=PubMed&retmode=xml&rettype=citation&id="; +var ids = new Array(); var idRegexp = /[\?\&]list_uids=([0-9\,]+)/; -var m = idRegexp.exec(uri); -newUri += m[1]; +var m = idRegexp.exec(uri); +if(m) { + ids.push(m[1]); +} else { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var items = new Array(); + var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''//div[@class="ResultSet"]/table/tbody'', nsResolver); + // Go through table rows + for(var i=0; i]*>/, "").replace(/<\?xml[^>]*\?>/, ""); @@ -1773,6 +1802,7 @@ utilities.HTTPUtilities.doGet(newUri, null, function(text) { for(var i=0; i