From 7111f132c9a03f9b4462a068f5f5d8db5bc32572 Mon Sep 17 00:00:00 2001
From: Sean Takats
Date: Wed, 13 Jun 2007 16:54:53 +0000
Subject: [PATCH] Adds Elena's Ancestry.com translator

---
 scrapers.sql | 232 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 230 insertions(+), 2 deletions(-)

diff --git a/scrapers.sql b/scrapers.sql
index 822b70058..bdc94e8d6 100644
--- a/scrapers.sql
+++ b/scrapers.sql
@@ -1,4 +1,4 @@
--- 234
+-- 235
 
 -- ***** BEGIN LICENSE BLOCK *****
 -- 
@@ -22,7 +22,7 @@
 
 
 -- Set the following timestamp to the most recent scraper update date
-REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-06-13 01:00:00'));
+REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-06-13 20:00:00'));
 
 REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2007-03-21 15:26:54', '1', '100', '4', 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon',
 'function detectWeb(doc, url) {
@@ -217,6 +217,234 @@ REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b
 	Zotero.wait();
 }');
 
+REPLACE INTO translators VALUES ('0dda3f89-15de-4479-987f-cc13f1ba7999', '1.0.0b3r1', '', '2007-06-13 20:00:00', '0', '100', '4', 'Ancestry.com US Federal Census', 'Elena Razlogova', '^https?://search\.ancestry\.com/.*(?:usfedcen|1890orgcen)',
+'// detectWeb reports which kind of Ancestry.com census page is open:
+//   "multiple"    - a result list whose first "tblrow record" row holds exactly two links
+//   "bookSection" - any other page, provided a logout link is present and the URL
+//                   contains neither "pf=" nor "&h=" (scrape builds its snapshot URL with pf=1)
+// nothing is returned for every other page
+function detectWeb(doc, url) {
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+
+	var listPath = ''//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="s_container"]/div[@class="p_rsltList"]'';
+	var result = doc.evaluate(listPath, doc, nsResolver,
+		XPathResult.ANY_TYPE, null).iterateNext();
+
+	// count the links of the first record row only
+	var linkNo = 0;
+	var row = doc.evaluate(listPath+''/table/tbody/tr[@class="tblrow record"]'',
+		doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+	if(row) {
+		var links = doc.evaluate(''.//a'', row, nsResolver, XPathResult.ANY_TYPE, null);
+		while(links.iterateNext()) {
+			linkNo = linkNo+1;
+		}
+	}
+
+	if(result && linkNo == 2) {
+		return "multiple";
+	}
+
+	var loggedIn = doc.evaluate(''//a[@id="_ctl16__ctl4_m_logout"]|//a[@id="_ctl18__ctl4_m_logout"]'', doc, nsResolver,
+		XPathResult.ANY_TYPE, null).iterateNext();
+
+	var href = doc.location.href;
+	if(loggedIn && href.indexOf("pf=") == -1 && href.indexOf("&h=") == -1) {
+		return "bookSection";
+	}
+}',
+'// this US Federal Census scraper is a hack - so far there is no proper item type in Zotero for this kind of data (added to trac as a low priority ticket)
+// this scraper creates proper citation for the census as a whole (should be cited as book)
+// but also adds name, city, and state for a particular individual to the citation to make scanning for names & places easier in the middle pane
+// (that''s why the resulting item type is a book section)
+// it also adds all searchable text as a snapshot and a scan of the census record as an image
+
+// Ancestry.com database name for a census year; 1890 is the only year that does not end in usfedcen
+function censusDb(year) {
+	if(year == "1890") {
+		return "1890orgcen";
+	}
+	return year+"usfedcen";
+}
+
+// saves the census record shown in doc as a bookSection item, with a snapshot and, when found, the scanned image
+function scrape(doc) {
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+
+	// get initial census data; a proper census record item type should have separate fields for all of these except perhaps dbid
+	var info = doc.evaluate(''//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="g_right"]/div[@class="g_box"]/p/a'',
+		doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+
+	// query parameters of that link; left empty when the link is not on the page
+	var data = new Object();
+	if(info) {
+		var parts = info.toString().split(/[?&]/);
+		for(var i = 0; i < parts.length; i++) {
+			var index = parts[i].indexOf("=");
+			if(index !== -1) {
+				data[parts[i].substr(0, index)] = parts[i].substr(index+1);
+			}
+		}
+	}
+
+	// value of one query parameter with "+" turned back into spaces, "" when absent
+	function field(name) {
+		return data[name] ? data[name].replace(/\+/g, " ") : "";
+	}
+
+	// a record without ln carries its only name part in fn; it is stored as the last name
+	var lastName = data.ln ? field("ln") : field("fn");
+	var firstName = data.ln ? field("fn") : "";
+	// data.by (presumably the birth date) and data.rcnty (county) are not saved yet because no field is available; the info is in the snapshot
+
+	// census year: first four-digit run of rfd, else ry, else the database name in the page URL
+	var year = data.ry;
+	if(data.rfd) {
+		var rfdYear = /([0-9]{4})/.exec(data.rfd);
+		if(rfdYear) {
+			year = rfdYear[1];
+		}
+	}
+	if(!year) {
+		var urlYear = /([0-9]{4})(?:usfedcen|orgcen)/.exec(doc.location.href);
+		if(urlYear) {
+			year = urlYear[1];
+		}
+	}
+	var yearDb = year ? censusDb(year) : "";
+	var state = field("rs");
+	var city = field("rcty");
+	var dbid = data.dbid;
+
+	// set census number for citation
+	var censusNos = {"1790":"First", "1800":"Second", "1810":"Third", "1820":"Fourth", "1830":"Fifth", "1840":"Sixth", "1850":"Seventh", "1860":"Eighth", "1870":"Ninth",
+		"1880":"Tenth", "1890":"Eleventh", "1900":"Twelfth", "1910":"Thirteenth", "1920":"Fourteenth", "1930":"Fifteenth"};
+	var censusNo = censusNos.hasOwnProperty(year) ? censusNos[year] : "";
+
+	//begin adding item
+	var newItem = new Zotero.Item("bookSection");
+	// this is not proper citation but is needed to easily scan for placenames in middle pane
+	var placeNames = new Array();
+	if(city) placeNames.push(city);
+	if(state) placeNames.push(state);
+	newItem.title = placeNames.join(", ");
+	newItem.publicationTitle = (censusNo ? censusNo+" " : "")+"Census of the United States"+(year ? ", "+year : "");
+	newItem.publisher = "National Archives and Records Administration";
+	newItem.place = "Washington, DC";
+	if(year) {
+		newItem.date = year;
+	}
+
+	// get snapshot with all searchable text and a simplified link to the record for the URL field
+	var recordId = /recid=([0-9]+)/.exec(doc.location.href);
+	if(recordId && yearDb) {
+		var snapshotURL = "http://search.ancestry.com/cgi-bin/sse.dll?db="+yearDb+"&indiv=1&pf=1&recid="+recordId[1];
+		newItem.attachments.push({title:"Ancestry.com Snapshot", mimeType:"text/html", url:snapshotURL, snapshot:true});
+		newItem.url = "http://search.ancestry.com/cgi-bin/sse.dll?indiv=1&db="+yearDb+"&recid="+recordId[1];
+	}
+
+	// add particular individual being surveyed as contributor - this is not proper citation but is needed so one could easily scan for names in middle pane
+	if(lastName) {
+		newItem.creators.push({firstName:firstName, lastName:lastName, creatorType:"contributor"});
+	}
+
+	//add proper author for citation
+	newItem.creators.push({lastName:"United States of America, Bureau of the Census", creatorType:"author"});
+
+	// get scan of the census image
+	var scanInfo = doc.evaluate(''//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="g_main"]/div[@class="g_outerBox"]/div[@class="s_container"]/div[@class="g_box2"]/table[@class="p_recTable"]/tbody/tr/td[2][@class="recordTN"]/a'',
+		doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+
+	// declared locally so that a record without a scan never picks up a URL left by an earlier record
+	var scanURL = null;
+	if(scanInfo && dbid) {
+		var scanId = /iid=([A-Z0-9_-]+)/.exec(scanInfo.toString());
+		if(scanId) {
+			scanURL = "http://content.ancestry.com/Browse/print_u.aspx?dbid="+dbid+"&iid="+scanId[1];
+			Zotero.debug("scan url: " + scanURL);
+		}
+	}
+
+	if(scanURL) {
+		Zotero.Utilities.HTTP.doGet(scanURL, function(text) {
+			Zotero.debug("running doGet");
+			var image = /950 src="([^"]+)"/.exec(text);
+			if(image) {
+				Zotero.debug("image url: " + image[1]);
+				newItem.attachments.push({title:"Ancestry.com Image", mimeType:"image/jpeg", url:image[1], snapshot:true});
+			}
+
+			newItem.complete();
+			Zotero.done();
+		});
+	} else {
+		newItem.complete();
+		Zotero.done();
+	}
+}
+
+// scrapes a single record page (URL contains recid=) directly, otherwise lets the user pick records from the result list
+function doWeb(doc, url) {
+	var resultsRegexp = /recid=/;
+	if(resultsRegexp.test(url)) {
+		scrape(doc);
+	} else {
+		var namespace = doc.documentElement.namespaceURI;
+		var nsResolver = namespace ? function(prefix) {
+			if (prefix == ''x'') return namespace; else return null;
+		} : null;
+
+		// get census year for links to items
+		var yearMatch = /db=([0-9]+)/.exec(doc.location.href);
+		if(!yearMatch) {
+			yearMatch = /([0-9]{4})(?:usfedcen|orgcen)/.exec(doc.location.href);
+		}
+		if(!yearMatch) {
+			Zotero.debug("no census year in " + doc.location.href);
+			return true;
+		}
+		var db = censusDb(yearMatch[1]);
+
+		//select items
+		var items = new Object();
+		var rowPath = ''//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="s_container"]/div[@class="p_rsltList"]/table/tbody/tr'';
+		var listElts = doc.evaluate(rowPath+''[@class="tblrowalt record"] | ''+rowPath+''[@class="tblrow record"]'',
+			doc, nsResolver, XPathResult.ANY_TYPE, null);
+		var recidRe = /^javascript:go[0-9]+_([0-9]+)/;
+		var listElt;
+		while(listElt = listElts.iterateNext()) {
+			var recInfo = doc.evaluate(''.//a'', listElt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+			var recid = recInfo ? recidRe.exec(recInfo.toString()) : null;
+			// a row without a record id cannot be linked; skip it instead of reusing the previous id
+			if(!recid) {
+				continue;
+			}
+			var link = "http://search.ancestry.com/cgi-bin/sse.dll?indiv=1&db="+db+"&recid="+recid[1];
+			var hit = doc.evaluate(''.//span[@class="srchHit"]'', listElt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+			// fall back to the link text when the row has no "srchHit" span
+			items[link] = Zotero.Utilities.cleanString(hit ? hit.textContent : recInfo.textContent);
+		}
+
+		items = Zotero.selectItems(items);
+		if(!items) return true;
+
+		var urls = new Array();
+		for(var i in items) {
+			urls.push(i);
+		}
+
+		Zotero.Utilities.processDocuments(urls, scrape, function() { Zotero.done(); });
+		Zotero.wait();
+
+	}
+}');
+
 REPLACE INTO translators VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '1.0.0b3.r1', '', '2007-03-24 22:20:00', 1, 100, 4, 'OCLC WorldCat FirstSearch', 'Simon Kornblith', '^https?://(?:new)?firstsearch\.oclc\.org[^/]*/WebZ/',
 'function detectWeb(doc, url) {
 	var detailRe = /FirstSearch: [\w ]+ Detailed Record/;