Adds Elena's Ancestry.com translator
This commit is contained in:
parent
b4bfa6cf1a
commit
7111f132c9
215
scrapers.sql
215
scrapers.sql
|
@ -1,4 +1,4 @@
|
|||
-- 234
|
||||
-- 235
|
||||
|
||||
-- ***** BEGIN LICENSE BLOCK *****
|
||||
--
|
||||
|
@ -22,7 +22,7 @@
|
|||
|
||||
|
||||
-- Set the following timestamp to the most recent scraper update date
|
||||
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-06-13 01:00:00'));
|
||||
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-06-13 20:00:00'));
|
||||
|
||||
REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2007-03-21 15:26:54', '1', '100', '4', 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon',
|
||||
'function detectWeb(doc, url) {
|
||||
|
@ -217,6 +217,217 @@ REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b
|
|||
Zotero.wait();
|
||||
}');
|
||||
|
||||
-- Ancestry.com US Federal Census translator.
-- The target pattern is anchored to search.ancestry.com and groups both database
-- suffixes, so "1890orgcen" only matches on that host (previously the second
-- alternative was unanchored and matched any URL containing it).
REPLACE INTO translators VALUES ('0dda3f89-15de-4479-987f-cc13f1ba7999', '1.0.0b3r1', '', '2007-06-13 20:00:00', '0', '100', '4', 'Ancestry.com US Federal Census', 'Elena Razlogova', '^https?://search\.ancestry\.com/.*(?:usfedcen|1890orgcen)',
|
||||
'function detectWeb(doc, url) {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
var result = doc.evaluate(''//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="s_container"]/div[@class="p_rsltList"]'', doc, nsResolver,
|
||||
XPathResult.ANY_TYPE, null).iterateNext();
|
||||
|
||||
var rows = doc.evaluate(''//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="s_container"]/div[@class="p_rsltList"]/table/tbody/tr[@class="tblrow record"]'',
|
||||
doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
var row;
|
||||
while(row = rows.iterateNext()) {
|
||||
links = doc.evaluate(''.//a'', row, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
var linkNo=0;
|
||||
while(link=links.iterateNext()) {
|
||||
linkNo=linkNo+1;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if(result && linkNo == 2) {
|
||||
return "multiple";
|
||||
} else {
|
||||
var loggedIn = doc.evaluate(''//a[@id="_ctl16__ctl4_m_logout"]|//a[@id="_ctl18__ctl4_m_logout"]'', doc, nsResolver,
|
||||
XPathResult.ANY_TYPE, null).iterateNext();
|
||||
|
||||
checkURL = doc.location.href.replace("pf=", "").replace("&h=", "");
|
||||
if(doc.location.href == checkURL && loggedIn) {
|
||||
return "bookSection";
|
||||
}
|
||||
}
|
||||
}',
|
||||
'// this US Federal Census scraper is a hack - so far there is no proper item type in Zotero for this kind of data (added to trac as a low priority ticket)
|
||||
// this scraper creates proper citation for the census as a whole (should be cited as book)
|
||||
// but also adds name, city, and state for a particular individual to the citation to make scanning for names & places easier in the middle pane
|
||||
// (that''s why the resulting item type is a book section)
|
||||
// it also adds all searchable text as a snapshot and a scan of the census record as an image
|
||||
|
||||
function scrape(doc) {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
// get initial census data; a proper census record item type should have separate fields for all of these except perhaps dbid
|
||||
var info = doc.evaluate(''//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="g_right"]/div[@class="g_box"]/p/a'',
|
||||
doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||
|
||||
if(info) {
|
||||
|
||||
info = info.toString();
|
||||
var data = new Array();
|
||||
var parts = info.split(/[?&]/);
|
||||
for each(var part in parts) {
|
||||
var index = part.indexOf("=");
|
||||
if(index !== -1) {
|
||||
data[part.substr(0, index)] = part.substr(index+1);
|
||||
}
|
||||
}
|
||||
|
||||
if(data.ln) {
|
||||
var lastName = data.ln.replace(/\+/g, " ");
|
||||
var firstName = data.fn.replace(/\+/g, " ");
|
||||
} else {
|
||||
var lastName = data.fn.replace(/\+/g, " ");
|
||||
var firstName = "";
|
||||
}
|
||||
var dOb = data.by; // this does not get saved yet because no field is available; the info is in the snapshot
|
||||
if(data.rfd) {
|
||||
var yearRe = /([0-9]{4})/;
|
||||
var m = yearRe.exec(data.rfd);
|
||||
if(m) {
|
||||
var year = m[1];
|
||||
}
|
||||
} else { var year = data.ry; }
|
||||
if (year == 1890) {
|
||||
var yearDb = "1890orgcen";
|
||||
} else { var yearDb = year+"usfedcen"; }
|
||||
var state = data.rs.replace(/\+/g, " ");
|
||||
var county = data.rcnty.replace(/\+/g, " "); // this does not get saved yet because no field is available; the info is in the snapshot
|
||||
var city = data.rcty.replace(/\+/g, " ");
|
||||
var dbid = data.dbid;
|
||||
}
|
||||
|
||||
// set census number for citation - let me know if this can be done in a better way
|
||||
var censusYear = 0;
|
||||
var censusNo = "";
|
||||
var censusNos = new Array("1790", "First", "1800", "Second", "1810", "Third", "1820", "Fourth", "1830", "Fifth", "1840", "Sixth", "1850", "Seventh", "1860", "Eighth", "1870", "Ninth",
|
||||
"1880", "Tenth", "1890", "Eleventh", "1900", "Twelfth", "1910", "Thirteenth", "1920", "Fourteenth", "1930", "Fifteenth")
|
||||
for(var i in censusNos) {
|
||||
if(censusYear == 1) { censusNo = censusNos[i] };
|
||||
if(censusNos[i] == year) { censusYear = 1 } else {censusYear= 0 };
|
||||
}
|
||||
|
||||
//begin adding item
|
||||
var newItem = new Zotero.Item("bookSection");
|
||||
newItem.title = city+", "+state; // this is not proper citation but is needed to easily scan for placenames in middle pane
|
||||
newItem.publicationTitle = censusNo+" Census of the United States, "+year;
|
||||
newItem.publisher = "National Archives and Records Administration";
|
||||
newItem.place = "Washington, DC";
|
||||
newItem.date = year;
|
||||
|
||||
// get snapshot with all searchable text and a simplified link to the record for the URL field
|
||||
var snapshotRe = /recid=([0-9]+)/;
|
||||
var m = snapshotRe.exec(doc.location.href);
|
||||
if(m) {
|
||||
snapshotURL = "http://search.ancestry.com/cgi-bin/sse.dll?db="+yearDb+"&indiv=1&pf=1&recid="+m[1];
|
||||
newItem.attachments.push({title:"Ancestry.com Snapshot", mimeType:"text/html", url:snapshotURL, snapshot:true});
|
||||
cleanURL = "http://search.ancestry.com/cgi-bin/sse.dll?indiv=1&db="+yearDb+"&recid="+m[1];
|
||||
newItem.url = cleanURL;
|
||||
}
|
||||
|
||||
// add particular individual being surveyed as contributor - this is not proper citation but is needed so one could easily scan for names in middle pane
|
||||
var creator = new Array();
|
||||
creator.firstName = firstName;
|
||||
creator.lastName = lastName;
|
||||
creator.creatorType = "contributor";
|
||||
newItem.creators.push(creator);
|
||||
|
||||
//add proper author for citation
|
||||
var creator = new Array();
|
||||
creator.lastName = "United States of America, Bureau of the Census";
|
||||
creator.creatorType = "author";
|
||||
newItem.creators.push(creator);
|
||||
|
||||
// get scan of the census image
|
||||
var scanInfo = doc.evaluate(''//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="g_main"]/div[@class="g_outerBox"]/div[@class="s_container"]/div[@class="g_box2"]/table[@class="p_recTable"]/tbody/tr/td[2][@class="recordTN"]/a'',
|
||||
doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||
|
||||
if(scanInfo) {
|
||||
var scanRe = /iid=([A-Z0-9_-]+)/;
|
||||
var m = scanRe.exec(scanInfo);
|
||||
if(m) {
|
||||
scanURL = "http://content.ancestry.com/Browse/print_u.aspx?dbid="+dbid+"&iid="+m[1];
|
||||
Zotero.debug("scan url: " + scanURL);
|
||||
}
|
||||
}
|
||||
|
||||
if(scanURL){
|
||||
Zotero.Utilities.HTTP.doGet(scanURL, function(text) {
|
||||
Zotero.debug("running doGet");
|
||||
Zotero.debug(text);
|
||||
var imageRe = /950 src="([^"]+)"/;
|
||||
var m = imageRe.exec(text);
|
||||
if(m) {
|
||||
imageURL = m[1];
|
||||
Zotero.debug("image url: " + imageURL);
|
||||
newItem.attachments.push({title:"Ancestry.com Image", mimeType:"image/jpeg", url:imageURL, snapshot:true});
|
||||
}
|
||||
|
||||
newItem.complete();
|
||||
Zotero.done();
|
||||
});
|
||||
} else {
|
||||
newItem.complete();
|
||||
Zotero.done();
|
||||
}
|
||||
}
|
||||
|
||||
function doWeb(doc, url) {
|
||||
var resultsRegexp = /recid=/;
|
||||
if(resultsRegexp.test(url)) {
|
||||
scrape(doc);
|
||||
} else {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
// get census year for links to items
|
||||
var yearRe = /db=([0-9]+)/;
|
||||
var m = yearRe.exec(doc.location.href);
|
||||
if(m) {
|
||||
year = m[1];
|
||||
}
|
||||
|
||||
//select items
|
||||
var items = new Array();
|
||||
var listElts = doc.evaluate(''//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="s_container"]/div[@class="p_rsltList"]/table/tbody/tr[@class="tblrowalt record"] | //div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="s_container"]/div[@class="p_rsltList"]/table/tbody/tr[@class="tblrow record"]'',
|
||||
doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
var recid;
|
||||
var link;
|
||||
var name;
|
||||
while (listElt = listElts.iterateNext()) {
|
||||
recInfo = doc.evaluate(''.//a'', listElt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||
var recidRe = /^javascript:go[0-9]+_([0-9]+)/;
|
||||
var m = recidRe.exec(recInfo);
|
||||
if(m) {
|
||||
recid = m[1];
|
||||
}
|
||||
link = "http://search.ancestry.com/cgi-bin/sse.dll?indiv=1&db="+year+"usfedcen&recid="+recid;
|
||||
name = doc.evaluate(''.//span[@class="srchHit"]'', listElt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
|
||||
items[link] = Zotero.Utilities.cleanString(name);
|
||||
}
|
||||
|
||||
items = Zotero.selectItems(items);
|
||||
if(!items) return true;
|
||||
|
||||
var urls = new Array();
|
||||
for(var i in items) {
|
||||
urls.push(i);
|
||||
}
|
||||
|
||||
Zotero.Utilities.processDocuments(urls, scrape, function() { Zotero.done(); });
|
||||
Zotero.wait();
|
||||
|
||||
}
|
||||
}');
|
||||
|
||||
REPLACE INTO translators VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '1.0.0b3.r1', '', '2007-03-24 22:20:00', 1, 100, 4, 'OCLC WorldCat FirstSearch', 'Simon Kornblith', '^https?://(?:new)?firstsearch\.oclc\.org[^/]*/WebZ/',
|
||||
'function detectWeb(doc, url) {
|
||||
var detailRe = /FirstSearch: [\w ]+ Detailed Record/;
|
||||
|
|
Loading…
Reference in New Issue
Block a user