diff --git a/scrapers.sql b/scrapers.sql index bdc94e8d6..fbd1c6d0a 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,4 +1,4 @@ --- 235 +-- 236 -- ***** BEGIN LICENSE BLOCK ***** -- @@ -22,7 +22,7 @@ -- Set the following timestamp to the most recent scraper update date -REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-06-13 20:00:00')); +REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-06-15 20:00:00')); REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2007-03-21 15:26:54', '1', '100', '4', 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon', 'function detectWeb(doc, url) { @@ -217,6 +217,141 @@ REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b Zotero.wait(); }');
+
+-- Epicurious translator. The first code string is the page detector, the
+-- second is the scraper. JavaScript single quotes are doubled because the
+-- code lives inside SQL string literals.
+REPLACE INTO translators VALUES ('aee2323e-ce00-4fcc-a949-06eb1becc98f', '1.0.0b4r1', '', '2007-06-15 20:00:00', '0', '100', '4', 'Epicurious', 'Sean Takats', '^https?://www\.epicurious\.com/recipes/(?:find/results|recipe_views/views/)',
+'// Classifies the page: "document" when the recipe ingredients block is
+// present, "multiple" when a search-results table is present, and
+// undefined (no return) otherwise.
+function detectWeb(doc, url){
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+
+	var xpath = ''//div[@class="recipeDetailLeftDiv"][@id="ingredients"]'';
+	var multxpath = ''//div[@id="left"]/table[@class="searchresults"]/tbody/tr'';
+
+	if(doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()){
+		return "document";
+	} else if (doc.evaluate(multxpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()){
+		return "multiple";
+	}
+}',
+'// Builds the namespace resolver passed to doc.evaluate: maps the "x"
+// prefix to the document namespace, or is null when the document has none.
+function getResolver(doc){
+	var namespace = doc.documentElement.namespaceURI;
+	return namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+}
+
+// Returns the first node matching xpath in doc, or null when nothing matches.
+function firstNode(doc, xpath){
+	return doc.evaluate(xpath, doc, getResolver(doc), XPathResult.ANY_TYPE, null).iterateNext();
+}
+
+// Normalizes whitespace in scraped text: collapses runs of newlines, drops
+// tabs that follow a line break, and turns remaining tab runs into a space.
+function cleanText(s){
+	s = s.replace(/\n+/g, "\n");
+	s = s.replace(/(\n|\r)\t+/g, "\n");
+	s = s.replace(/\t+/g, " ");
+	// The original call was s.replace(<one invisible character>, "", "g"),
+	// which depends on a non-standard, Firefox-only flags argument.
+	// NOTE(review): the character is assumed to be a non-breaking space
+	// (U+00A0), since removing plain spaces would undo the line above.
+	// Confirm against the upstream file.
+	s = s.replace(/\u00A0/g, "");
+	return s;
+}
+
+// Returns the cleaned text of the first node matching xpath, or null when
+// the page has no such node.
+function sectionText(doc, xpath){
+	var node = firstNode(doc, xpath);
+	if (!node) {
+		return null;
+	}
+	return cleanText(Zotero.Utilities.superCleanString(node.textContent));
+}
+
+// Scrapes one recipe page into a "document" item: title, contributors, date,
+// abstract, one note per recipe section, a printer-friendly snapshot and a
+// link attachment. Completes the item before returning.
+function scrape(doc){
+	var newItem = new Zotero.Item("document");
+
+	// Strip the site suffix only when it is really there. With a bare
+	// substring(0, indexOf(...)) a missing suffix (-1) gave an empty title.
+	var titleNode = firstNode(doc, ''//title'');
+	var title = titleNode ? titleNode.textContent : "";
+	var suffixAt = title.indexOf(" Recipe at Epicurious.com");
+	if (suffixAt != -1) {
+		title = title.substring(0, suffixAt);
+	}
+	newItem.title = title;
+
+	// The first source paragraph is read as "author, date"; every further
+	// source paragraph is treated as one more contributor.
+	var sources = doc.evaluate(''//div[@id="sourceInfo"]/p[@class="source"]'', doc, getResolver(doc), XPathResult.ANY_TYPE, null);
+	var source = sources.iterateNext();
+	if (source){
+		var parts = source.textContent.split(",");
+		newItem.creators.push(Zotero.Utilities.cleanAuthor(parts[0], "contributor", true));
+		if (parts.length > 1){
+			// Keep everything after the first comma so a date that itself
+			// contains a comma is not cut short. cleanString presumably
+			// trims the space left behind by the split.
+			newItem.date = Zotero.Utilities.cleanString(parts.slice(1).join(","));
+		}
+		while (source = sources.iterateNext()){
+			Zotero.debug(source.textContent);
+			newItem.creators.push(Zotero.Utilities.cleanAuthor(source.textContent, "contributor", false));
+		}
+	}
+
+	var intro = firstNode(doc, ''//div[@class="recipeDetailLeftDiv"][@id="intro"]/p'');
+	if (intro){
+		newItem.abstractNote = Zotero.Utilities.cleanString(intro.textContent);
+	}
+
+	var ingredients = sectionText(doc, ''//div[@class="recipeDetailLeftDiv"][@id="ingredients"]'');
+	var prep = sectionText(doc, ''//div[@class="recipeDetailLeftDiv"][@id="preparation"]'');
+	if (prep){
+		// Blank line between preparation steps.
+		prep = prep.replace(/\n/g, "\n\n");
+	}
+	var serving = sectionText(doc, ''//div[@id="servingInfo"]'');
+
+	// One note per section, skipping sections that are missing or empty so
+	// no note with undefined text is created.
+	var sections = [ingredients, prep, serving];
+	for (var i = 0; i < sections.length; i++){
+		if (sections[i]){
+			newItem.notes.push({note:sections[i]});
+		}
+	}
+
+	var url = doc.location.href;
+
+	// The snapshot is taken from the printer-friendly variant of the page.
+	var snapshotURL = url.replace("/views/", "/printer_friendly/");
+	newItem.attachments.push({title:"Epicurious.com Snapshot", mimeType:"text/html", url:snapshotURL, snapshot:true});
+	newItem.url = url;
+	newItem.attachments.push({title:"Epicurious.com Link", snapshot:false, mimeType:"text/html", url:url});
+
+	newItem.complete();
+}
+
+// Entry point. A recipe page is scraped directly. On a search-results page
+// the user picks from the listed recipes and each chosen page is loaded and
+// scraped. Returns true when the selection is cancelled.
+function doWeb(doc, url){
+	var singxpath = ''//div[@class="recipeDetailLeftDiv"][@id="ingredients"]'';
+	var multxpath = ''//div[@id="left"]/table[@class="searchresults"]/tbody/tr'';
+	if (firstNode(doc, singxpath)){
+		// single recipe page
+		scrape(doc, url);
+	} else if (firstNode(doc, multxpath)){
+		// Map each result link to its title for the selection dialog.
+		var items = new Object();
+		var elmtxpath = ''//div[@id="left"]/table[@class="searchresults"]/tbody/tr/td[@class="pd2"]/a[@class="hed"]'';
+		var elmts = doc.evaluate(elmtxpath, doc, getResolver(doc), XPathResult.ANY_TYPE, null);
+		var elmt;
+		while (elmt = elmts.iterateNext()) {
+			var title = elmt.textContent;
+			var link = elmt.href;
+			if (title && link){
+				items[link] = title;
+			}
+		}
+
+		items = Zotero.selectItems(items);
+		if(!items) {
+			return true;
+		}
+
+		var urls = new Array();
+		for(var i in items) {
+			urls.push(i);
+		}
+
+		Zotero.Utilities.processDocuments(urls, scrape, function() { Zotero.done(); });
+		Zotero.wait();
+	}
+}');
+
 REPLACE INTO
translators VALUES ('0dda3f89-15de-4479-987f-cc13f1ba7999', '1.0.0b3r1', '', '2007-06-13 20:00:00', '0', '100', '4', 'Ancestry.com US Federal Census', 'Elena Razlogova', '^https?://search.ancestry.com/(.*)usfedcen|1890orgcen', 'function detectWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; @@ -3636,6 +3771,820 @@ function doWeb() { getAllIds(); }'); +REPLACE INTO translators VALUES ('3af43735-36d3-46ae-9ca8-506ff032b0d3', '1.0.0b4.r1', '', '2007-06-15 20:00:00', '0', '100', '4', 'HeinOnline', 'Bill McKinney', 'http:\/\/heinonline\.org\/HOL\/Page\?handle\=hein\.journals\/.+', +'function detectWeb(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var re = /http:\/\/heinonline\.org\/HOL\/Page\?handle\=hein\.journals\/.+/ + if(re.test(url)) { + return "book"; + } else { + var aTags = doc.getElementsByTagName("a"); + for(var i=0; i 0) { + for (var i = 0; i < p[4].options.length; i++) { + if (p[4].options[ i ].selected) { + selectedPage = p[4].options[i].value; + pageNum = p[4].options[i].innerHTML; + newItem.pages = pageNum.replace(/^Page\s+/,"") + "-"; + } + } + } + + + // get handle + var handle=""; + var handleRe = /handle=([^\&]+)\&/ + var handleMatch = handleRe.exec(doc.location.href); + if (handleMatch) { + handle = handleMatch[1]; + } + + // fetch citation + var url = "http://heinonline.org/HOL/citation-info?handle="+handle+"&id="+selectedPage+"&rand=12345&collection=journals"; + Zotero.Utilities.HTTP.doGet(url, function(text) { + + var tmpTxt = text; + var citeRe = /(\d+)\s+(.+)\s+(\d+)\s+\(([^\)]+)\)\s+
\s+([^;]+)(;\s.+[\S])/ + var citeMatch = citeRe.exec(tmpTxt) + if (citeMatch) { + + newItem.volume = citeMatch[1]; + //newItem.issue= citeMatch[3]; + newItem.date = citeMatch[4]; + newItem.journalAbbreviation = citeMatch[2]; + newItem.title = citeMatch[5]; + + var tmpAuthors = citeMatch[6]; + var authors = tmpAuthors.split(";"); + for (i=1;i -1) { + newItem.history = "Source: " + boldTags[i].nextSibling.nodeValue; + } + if (s.indexOf("Authority:") > -1) { + newItem.extra = "Authority: " + boldTags[i].nextSibling.nodeValue; + } + } + + newItem.complete(); +} + +function doWeb(doc, url) { + var re = new RegExp("http://ecfr\.gpoaccess\.gov/cgi/t/text/text-idx.+"); + if(re.test(doc.location.href)) { + scrape(doc); + } else { + var items = Zotero.Utilities.getItemArray(doc, doc,"http://ecfr\.gpoaccess\.gov/cgi/t/text/text-idx.+"); + items = Zotero.selectItems(items); + + if(!items) { + return true; + } + + var uris = new Array(); + for(var i in items) { + uris.push(i); + } + + Zotero.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, + function() { Zotero.done(); }, null); + + Zotero.wait(); + } +}'); + +REPLACE INTO translators VALUES ('5ed5ab01-899f-4a3b-a74c-290fb2a1c9a4', '1.0.0b4.r1', '', '2007-06-15 20:00:00', '0', '100', '4', 'AustLII and NZLII', 'Bill McKinney', 'http:\/\/www\.(?:austlii\.edu\.au|nzlii\.org)\/(?:\/cgi-bin\/disp\.pl\/)?(?:au|nz)\/cases\/.+', +'function detectWeb(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var austliiRegexp = /^http:\/\/www\.(?:austlii\.edu\.au|nzlii\.org)\/(?:\/cgi-bin\/disp\.pl\/)?(?:au|nz)\/cases\/.+/ + if(austliiRegexp.test(url)) { + return "book"; + } else { + var aTags = doc.getElementsByTagName("a"); + for(var i=0; i 0) { + var tmp = panel[0].innerHTML; + newItem.creators.push({lastName:tmp, creatorType:"judge", fieldMode:true}); + + } + // citation + var cite = doc.getElementsByTagName("CITATION"); + if (cite.length > 0) { + var tmpc = cite[0].childNodes[0].innerHTML; + newItem.notes.push({note:tmpc}); + } + + newItem.complete(); +} + +function doWeb(doc, url) { + var liiRegexp= /http:\/\/www\.bailii\.org(?:\/cgi\-bin\/markup\.cgi\?doc\=)?\/\w+\/cases\/.+/ + if(liiRegexp.test(url)) { + scrape(doc); + } else { + + var items = Zotero.Utilities.getItemArray(doc, doc, liiRegexp); + items = Zotero.selectItems(items); + + if(!items) { + return true; + } + + var urls = new Array(); + for(var i in items) { + urls.push(i); + } + + Zotero.Utilities.processDocuments(urls, scrape, function() { Zotero.done(); }); + Zotero.wait(); + } +}'); + +REPLACE INTO translators VALUES ('84799379-7bc5-4e55-9817-baf297d129fe', '1.0.0b4.r1', '', '2007-06-15 20:00:00', '0', '100', '4', 'CanLII', 'Bill McKinney', 'http:\/\/www\.canlii\.org\/en\/[^\/]+\/[^\/]+\/doc\/.+', +'function detectWeb(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var canLiiRegexp = /http:\/\/www\.canlii\.org\/en\/[^\/]+\/[^\/]+\/doc\/.+/ + if(canLiiRegexp .test(url)) { + return "book"; + } else { + var aTags = doc.getElementsByTagName("a"); + for(var i=0; i 0) { + var tmpDis = dis[0].innerHTML; + tmpDis = tmpDis.replace(/\s+/g, " "); + newItem.title = newItem.title + " (" + tmpDis + ")"; + newItem.caseName= newItem.caseName + " (" + tmpDis + ")"; + + } + + + // parse citation into parts so that bluebook can be constructed + var cite = doc.getElementsByTagName("CASENUMBER"); + if (cite.length > 0) { + var citeRegex = /([0-9]+)\s+U\.S\.\s+([0-9]+)/; + var citeMatch = citeRegex.exec(cite[0].innerHTML); + if (citeMatch) { + caselawSourceVolume = citeMatch[1]; + newItem.reporterVolume = citeMatch[1]; + caselawSourceStartPage = citeMatch[2]; + newItem.firstPage = citeMatch[2]; + } + } + + // look for offcite span element + var spanTags = doc.getElementsByTagName("span"); + if (spanTags.length > 0) { + for(var i=0; i 0) { + var tmpNotice= notice [0].innerHTML; + tmpNotice= tmpNotice.replace(/\s+/g, " "); + newItem.notes.push({note:tmpNotice}); + } + + newItem.complete(); +} + +function doWeb(doc, url) { + var liiRegexp = /http:\/\/www\.law\.cornell\.edu\/supct\/html\/.+/ + if(liiRegexp.test(url)) { + scrape(doc); + } else { + + var items = Zotero.Utilities.getItemArray(doc, doc, liiRegexp); + items = Zotero.selectItems(items); + + if(!items) { + return true; + } + + var urls = new Array(); + for(var i in items) { + urls.push(i); + } + + Zotero.Utilities.processDocuments(urls, scrape, function() { Zotero.done(); }); + Zotero.wait(); + } +}'); + +REPLACE INTO translators VALUES ('232e24fe-2f68-44fc-9366-ecd45720ee9e', '1.0.0b4.r1', '', '2007-06-15 20:00:00', '0', '100', '4', 'Patents - USPTO', 'Bill McKinney', '^http://patft\.uspto\.gov/netacgi/nph-Parser.+', +'function detectWeb(doc, url) { + var re = new 
RegExp("^http://patft\.uspto\.gov/netacgi/nph-Parser"); + if(re.test(doc.location.href)) { + return "book"; + } else { + return "multiple"; + } +}', +'function get_nextsibling(n) + { + var x=n.nextSibling; + while (x.nodeType!=1) + { + x=x.nextSibling; + } + return x; +} + +function scrape(doc) { + + var newItem = new Zotero.Item("patent"); + newItem.url = doc.location.href; + var extraText = new String(); + var tmpStr = new String(); + var tmpRefs = ""; + var tmpTitle = doc.title; + + var fontTags = doc.getElementsByTagName("font"); + for(var i=0; i]+>/g, ""); + newItem.title = tmpTitle; + + var cellTags = doc.getElementsByTagName("td"); + for(var i=0; i -1) { + + tmpStr = cellTags[i+1].childNodes[0].innerHTML; + tmpStr = tmpStr.replace(/<[^>]+>/gi, ""); + tmpStr = tmpStr.replace(/,/gi, ""); + newItem.patentNumber = tmpStr; + + tmpStr = cellTags[i+3].innerHTML; + tmpStr = tmpStr.replace(/<[^>]+>/gi, ""); + newItem.issueDate = tmpStr; + continue; + } + if (s.indexOf("Assignee") > -1) { + tmpStr = cellTags[i+1].innerHTML; + tmpStr = tmpStr.replace(/<\/?\w+>/gi, ""); + newItem.assignee = tmpStr; + continue; + } + if (s.indexOf("Inventors") > -1) { + tmpStr = cellTags[i+1].innerHTML; + + var inventors = tmpStr.split(/,/ig); + for (var j=0; j/gi, ""); + tmpInventor = tmpInventor.replace(/\([^\)]+\)/gi, ""); + tmpInventor = tmpInventor.replace(/^\s+/gi, ""); + + var names = tmpInventor.split(";"); + if (names) { + var lname = names[0]; + var fname = names[1]; + lname = lname.replace(/^\s+/gi, ""); + lname = lname.replace(/\s+$/gi, ""); + fname= fname.replace(/^\s+/gi, ""); + fname= fname.replace(/\s+$/gi, ""); + newItem.creators.push({lastName:lname, firstName:fname, creatorType:"inventor"}); + } + } + continue; + } + + // references + if (s.indexOf(" -1) { + tmpRefs = tmpRefs + cellTags[i].childNodes[0].innerHTML + " "; + } + if (s.indexOf(" -1) { + tmpRefs = tmpRefs + cellTags[i].childNodes[0].innerHTML + " "; + } + } + + var centerTags = 
doc.getElementsByTagName("center"); + for(var i=0; i -1) { + //newItem.extra = "ok"; + var el = get_nextsibling(centerTags[i]); + newItem.abstract= el.innerHTML; + } + + } + + newItem.references = tmpRefs; + newItem.complete(); +} + +function doWeb(doc, url) { + var re = new RegExp("^http://patft\.uspto\.gov/netacgi/nph-Parser.+"); + if(re.test(doc.location.href)) { + scrape(doc); + } else { + var items = Zotero.Utilities.getItemArray(doc, doc, "^http://patft\.uspto\.gov/netacgi/nph-Parser.+"); + items = Zotero.selectItems(items); + + if(!items) { + return true; + } + + var uris = new Array(); + for(var i in items) { + uris.push(i); + } + + Zotero.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, + function() { Zotero.done(); }, null); + + Zotero.wait(); + } +}'); + REPLACE INTO translators VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '1.0.0b3.r1', '', '2007-05-15 12:00:00', '1', '100', '4', 'Google Books', 'Simon Kornblith', '^http://books\.google\.[a-z]+/books\?(.*id=.*|.*q=.*)', 'function detectWeb(doc, url) { var re = new RegExp(''^http://books\\.google\\.[a-z]+/books\\?id=([^&]+)'', ''i'');