Adds Elena's Ancestry.com translator

2007-06-13 16:54:53 +00:00 · 2007-06-13 16:54:53 +00:00 · 7111f132c9
commit 7111f132c9
parent b4bfa6cf1a
1 changed files with 213 additions and 2 deletions
--- a/scrapers.sql
+++ b/scrapers.sql
@ -1,4 +1,4 @@
-- 234
+-- 235

 --  ***** BEGIN LICENSE BLOCK *****
 --  
@ -22,7 +22,7 @@


 -- Set the following timestamp to the most recent scraper update date
-REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-06-13 01:00:00'));
+REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2007-06-13 20:00:00'));

 REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2007-03-21 15:26:54', '1', '100', '4', 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon', 
 'function detectWeb(doc, url) {
@ -217,6 +217,217 @@ REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b
 	Zotero.wait();
 }');

+-- Target pattern: both database names (NNNNusfedcen and 1890orgcen) are grouped so that each stays anchored to search.ancestry.com
+REPLACE INTO translators VALUES ('0dda3f89-15de-4479-987f-cc13f1ba7999', '1.0.0b3r1', '', '2007-06-13 20:00:00', '0', '100', '4', 'Ancestry.com US Federal Census', 'Elena Razlogova', '^https?://search\.ancestry\.com/(.*)(?:usfedcen|1890orgcen)', 
+'function detectWeb(doc, url) {
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+		
+	var result = doc.evaluate(''//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="s_container"]/div[@class="p_rsltList"]'', doc, nsResolver,
+	             XPathResult.ANY_TYPE, null).iterateNext();
+
+	var rows = doc.evaluate(''//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="s_container"]/div[@class="p_rsltList"]/table/tbody/tr[@class="tblrow record"]'', 
+				doc, nsResolver, XPathResult.ANY_TYPE, null);
+	var row;
+	while(row = rows.iterateNext()) {
+		links = doc.evaluate(''.//a'', row, nsResolver, XPathResult.ANY_TYPE, null);
+		var linkNo=0;
+		while(link=links.iterateNext()) {
+			linkNo=linkNo+1;
+		}
+		break;
+	}
+	
+	if(result && linkNo == 2) {
+		return "multiple";
+	} else {
+		var loggedIn = doc.evaluate(''//a[@id="_ctl16__ctl4_m_logout"]|//a[@id="_ctl18__ctl4_m_logout"]'', doc, nsResolver,
+	             XPathResult.ANY_TYPE, null).iterateNext();
+	             
+		checkURL = doc.location.href.replace("pf=", "").replace("&h=", "");
+		if(doc.location.href == checkURL && loggedIn) {
+			return "bookSection";
+		}
+	} 
+}', 
+'// This US Federal Census scraper is a hack: so far Zotero has no proper item type for this kind of data (filed in Trac as a low-priority ticket).
+// It creates a proper citation for the census as a whole (which should be cited as a book),
+// but it also adds the name, city, and state of a particular individual to the citation, to make scanning for names and places easier in the middle pane
+// (that is why the resulting item type is a book section).
+// It also attaches all searchable text as a snapshot and a scan of the census record as an image.
+
+function scrape(doc) {
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+	
+	// get initial census data; a proper census record item type should have separate fields for all of these except perhaps dbid
+	var info = doc.evaluate(''//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="g_right"]/div[@class="g_box"]/p/a'', 
+		doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();	
+		
+	if(info) {	
+		
+		info = info.toString();
+		var data = new Array();
+		var parts = info.split(/[?&]/);
+		for each(var part in parts) {
+			var index = part.indexOf("=");
+			if(index !== -1) {
+				data[part.substr(0, index)] = part.substr(index+1);
+			}
+		}
+		
+		if(data.ln) {
+			var lastName = data.ln.replace(/\+/g, " ");
+			var firstName = data.fn.replace(/\+/g, " ");
+		} else { 
+			var lastName = data.fn.replace(/\+/g, " ");
+			var firstName = ""; 
+		}
+		var dOb = data.by; // this does not get saved yet because no field is available; the info is in the snapshot
+		if(data.rfd) {
+			var yearRe = /([0-9]{4})/;
+			var m = yearRe.exec(data.rfd);
+			if(m) { 
+				var year = m[1];
+			}
+		} else { var year = data.ry; }
+		if (year == 1890) {
+			var yearDb = "1890orgcen";
+		} else { var yearDb = year+"usfedcen"; }
+		var state = data.rs.replace(/\+/g, " "); 
+		var county = data.rcnty.replace(/\+/g, " "); // this does not get saved yet because no field is available; the info is in the snapshot
+		var city = data.rcty.replace(/\+/g, " "); 
+		var dbid = data.dbid;
+	}
+	
+	// set census number for citation - let me know if this can be done in a better way
+	var censusYear = 0;
+	var censusNo = "";
+	var censusNos = new Array("1790", "First", "1800", "Second", "1810", "Third", "1820", "Fourth", "1830", "Fifth", "1840", "Sixth", "1850", "Seventh", "1860", "Eighth", "1870", "Ninth", 
+			"1880", "Tenth", "1890", "Eleventh", "1900", "Twelfth", "1910", "Thirteenth", "1920", "Fourteenth", "1930", "Fifteenth")
+	for(var i in censusNos) {
+			if(censusYear == 1) { censusNo = censusNos[i] };
+			if(censusNos[i] == year) { censusYear = 1 } else {censusYear= 0 };
+		}
+
+	//begin adding item
+	var newItem = new Zotero.Item("bookSection");
+	newItem.title = city+", "+state; // this is not proper citation but is needed to easily scan for placenames in middle pane
+	newItem.publicationTitle = censusNo+" Census of the United States, "+year;
+	newItem.publisher = "National Archives and Records Administration";
+	newItem.place = "Washington, DC";
+	newItem.date = year;
+	
+	// get snapshot with all searchable text and a simplified link to the record for the URL field
+	var snapshotRe = /recid=([0-9]+)/;
+	var m = snapshotRe.exec(doc.location.href);
+	if(m) {
+		snapshotURL = "http://search.ancestry.com/cgi-bin/sse.dll?db="+yearDb+"&indiv=1&pf=1&recid="+m[1];
+		newItem.attachments.push({title:"Ancestry.com Snapshot", mimeType:"text/html", url:snapshotURL, snapshot:true});
+		cleanURL = "http://search.ancestry.com/cgi-bin/sse.dll?indiv=1&db="+yearDb+"&recid="+m[1];
+		newItem.url = cleanURL;
+	}
+			
+	// add particular individual being surveyed as contributor - this is not proper citation but is needed so one could easily scan for names in middle pane
+	var creator = new Array();
+	creator.firstName = firstName;
+	creator.lastName = lastName;
+	creator.creatorType = "contributor";
+	newItem.creators.push(creator);
+	
+	//add proper author for citation
+	var creator = new Array();
+	creator.lastName = "United States of America, Bureau of the Census";
+	creator.creatorType = "author";
+	newItem.creators.push(creator);
+
+	// get scan of the census image
+	var scanInfo = doc.evaluate(''//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="g_main"]/div[@class="g_outerBox"]/div[@class="s_container"]/div[@class="g_box2"]/table[@class="p_recTable"]/tbody/tr/td[2][@class="recordTN"]/a'', 
+		doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+	
+	if(scanInfo) {
+		var scanRe = /iid=([A-Z0-9_-]+)/;		
+		var m = scanRe.exec(scanInfo);
+		if(m) {
+			scanURL = "http://content.ancestry.com/Browse/print_u.aspx?dbid="+dbid+"&iid="+m[1];
+			Zotero.debug("scan url: " + scanURL);
+		}
+	}
+	
+	if(scanURL){
+		Zotero.Utilities.HTTP.doGet(scanURL, function(text) { 
+			Zotero.debug("running doGet");
+			Zotero.debug(text);
+			var imageRe = /950  src="([^"]+)"/;
+			var m = imageRe.exec(text);
+				if(m) {
+					imageURL = m[1];
+					Zotero.debug("image url: " + imageURL);
+					newItem.attachments.push({title:"Ancestry.com Image", mimeType:"image/jpeg", url:imageURL, snapshot:true});
+				}
+			
+			newItem.complete();
+			Zotero.done();	
+		});	
+	} else {
+		newItem.complete();
+		Zotero.done();
+	}
+}
+
+function doWeb(doc, url) {
+	var resultsRegexp = /recid=/;
+	if(resultsRegexp.test(url)) {
+		scrape(doc);
+	} else {
+		var namespace = doc.documentElement.namespaceURI;
+		var nsResolver = namespace ? function(prefix) {
+			if (prefix == ''x'') return namespace; else return null;
+		} : null;
+		
+		// get census year for links to items
+		var yearRe = /db=([0-9]+)/;
+		var m = yearRe.exec(doc.location.href);
+		if(m) {
+			year = m[1];
+		}
+		
+		//select items
+		var items = new Array();
+		var listElts = doc.evaluate(''//div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="s_container"]/div[@class="p_rsltList"]/table/tbody/tr[@class="tblrowalt record"] | //div[@class="g_container"]/div[@class="g_panelWrap"]/div[@class="g_panelCore"]/div[@class="s_container"]/div[@class="p_rsltList"]/table/tbody/tr[@class="tblrow record"]'', 
+				doc, nsResolver, XPathResult.ANY_TYPE, null);
+		var recid;
+		var link;
+		var name;
+		while (listElt = listElts.iterateNext()) {		
+			recInfo = doc.evaluate(''.//a'', listElt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+			var recidRe = /^javascript:go[0-9]+_([0-9]+)/;
+			var m = recidRe.exec(recInfo);
+			if(m) {
+				recid = m[1];
+			}
+			link = "http://search.ancestry.com/cgi-bin/sse.dll?indiv=1&db="+year+"usfedcen&recid="+recid;
+			name = doc.evaluate(''.//span[@class="srchHit"]'', listElt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
+			items[link] = Zotero.Utilities.cleanString(name);
+		} 
+
+		items = Zotero.selectItems(items);
+		if(!items) return true;
+
+		var urls = new Array();
+		for(var i in items) {
+			urls.push(i);
+		}
+		
+		Zotero.Utilities.processDocuments(urls, scrape, function() { Zotero.done(); });
+		Zotero.wait();
+
+	}
+}');
+
 REPLACE INTO translators VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '1.0.0b3.r1', '', '2007-03-24 22:20:00', 1, 100, 4, 'OCLC WorldCat FirstSearch', 'Simon Kornblith', '^https?://(?:new)?firstsearch\.oclc\.org[^/]*/WebZ/',
 'function detectWeb(doc, url) {
 	var detailRe = /FirstSearch: [\w ]+ Detailed Record/;