- added a "copy" feature to Scaffold, which copies a translator to the clipboard

- Implemented the ability to test regular expressions and run detectCode from within Scaffold, so it is now possible to generate an entire translator from within the environment.
- Added a Factiva translator, which should work, although Factiva just went down for maintenance a few minutes ago.
2006-12-17 01:27:42 +00:00 · 2006-12-17 01:27:42 +00:00 · 448faedab5
commit 448faedab5
parent cd557c2537
1 changed files with 126 additions and 2 deletions
--- a/scrapers.sql
+++ b/scrapers.sql
@ -1,4 +1,4 @@
-- 136
+-- 137

 --  ***** BEGIN LICENSE BLOCK *****
 --  
@ -22,7 +22,7 @@


 -- Set the following timestamp to the most recent scraper update date
-REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-12-16 16:29:00'));
+REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-12-16 20:20:46')); -- keep >= the newest translator date; must not go backwards

 REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b3.r1', '', '2006-12-15 03:40:00', 1, 100, 4, 'Amazon.com', 'Sean Takats', '^https?://(?:www\.)?amazon', 
 'function detectWeb(doc, url) {
@ -4971,6 +4971,130 @@ REPLACE INTO translators VALUES ('92d4ed84-8d0-4d3c-941f-d4b9124cfbb', '1.0.0b2.
 	Zotero.wait();
 }');

+REPLACE INTO translators VALUES ('7bdb79e-a47f-4e3d-b317-ccd5a0a74456', '1.0.0b3.r1', '', '2006-12-16 20:20:46', 1, 100, 4, 'Factiva', 'Simon Kornblith', '^http://global\.factiva\.com/ha/default\.aspx$', 
+'function detectWeb(doc, url) {
+	// Reports "newspaperArticle" for a single-article view and "multiple" for a
+	// result list; returns nothing when the page has no headline rows.
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+	
+	if(doc.evaluate(''//tr[@class="headline"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
+		if(doc.body.className == ''articleView'') {
+			return "newspaperArticle";
+		} else {
+			return "multiple";
+		}
+	}
+}', 
+'function doWeb(doc, url) {
+	// Collects the "hdl" handles of the wanted articles, re-posts the page form
+	// to the XML export URL and turns each returned article into an item.
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+	
+	var items = new Array();
+	var singlePage = doc.body.className == ''articleView'';
+	
+	// Result list: map handle -> headline text (for the selection dialog).
+	// Article view: map the number in the "count" cell -> handle.
+	var tableRows = doc.evaluate(''//tr[@class="headline"]'', doc, nsResolver, XPathResult.ANY_TYPE, null);
+	var tableRow;
+	while(tableRow = tableRows.iterateNext()) {
+		var hdl = doc.evaluate(''.//input[@name="hdl"]'', tableRow, nsResolver, XPathResult.ANY_TYPE,
+			null).iterateNext().value;
+		if(!singlePage){
+			items[hdl] = Zotero.Utilities.cleanString(tableRow.getElementsByTagName("a")[0].textContent);
+		} else {
+			var m = doc.evaluate(''.//td[@class="count"]'', tableRow, nsResolver, XPathResult.ANY_TYPE, 
+				null).iterateNext().textContent.match(/[0-9]+/);
+			items[m[0]] = hdl;
+		}
+	}
+	
+	if(!singlePage) {
+		// let the user choose; a falsy result ends the translation early
+		items = Zotero.selectItems(items);
+		if(!items) return true;
+		
+		var hdls = new Array();
+		for(var hdl in items) {
+			hdls.push(hdl);
+		}
+	} else {
+		// the header span starting with "Article" carries the number of the
+		// article on display; use it to look up the matching handle
+		var m = doc.evaluate(''//div[@class="articleHeader"][@id="artHdr1"]/span[substring(text(), 1, 7) = "Article"]'',
+			doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent.match(/[0-9]+/);
+		var hdls = [items[m[0]]];
+	}
+	
+	// Build the POST body from the hidden inputs and selects of PageBaseForm,
+	// followed by one hdl parameter per wanted article.
+	var post = "";
+	
+	var hiddenInputs = doc.evaluate(''//form[@name="PageBaseForm"]//input[@type="hidden"]'', doc, nsResolver,
+		XPathResult.ANY_TYPE, null);
+	var hiddenInput;
+	while(hiddenInput = hiddenInputs.iterateNext()) {
+		// unusual escaping (literal plus as %2B, space as plus), but the server
+		// is apparently very picky about it
+		post = post+"&"+hiddenInput.name+"="+escape(hiddenInput.value).replace(/\+/g, "%2B").replace(/\%20/g, "+");
+	}
+	
+	var selects = doc.evaluate(''//form[@name="PageBaseForm"]//select'', doc, nsResolver,
+		XPathResult.ANY_TYPE, null);
+	var select;
+	while(select = selects.iterateNext()) {
+		post = post+"&"+select.name+"="+escape(select.options[select.selectedIndex].value);
+	}
+	
+	for each(var hdl in hdls) {
+		post += "&hdl="+escape(hdl);
+	}
+	// drop the leading "&"
+	post = post.substr(1);
+	
+	Zotero.Utilities.HTTP.doPost("http://global.factiva.com/pps/default.aspx?pp=XML", post, function(text) {
+		// Remove xml parse instruction and doctype
+		text = text.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, "");
+		// kill the XML namespace, too, because we have no way of knowing what it will be, which presents a problem
+		text = text.replace(/<ppsArticleResponse xmlns="[^"]+">/, "<ppsArticleResponse>");
+		// kill hlt tags; they just make parsing harder
+		text = text.replace(/<\/?hlt>/g, "");
+		var xml = new XML(text);
+		
+		// loop through articles
+		for each(var ppsarticle in xml[0]..ppsarticle) {
+			var article = ppsarticle.article;
+			var newItem = new Zotero.Item("newspaperArticle");
+			
+			newItem.title = Zotero.Utilities.cleanString(article.headline.paragraph.text().toString());
+			newItem.publicationTitle = Zotero.Utilities.cleanString(article.sourceName.text().toString());
+			for each(var tag in article..name) {
+				newItem.tags.push(tag.text().toString());
+			}
+			newItem.date = Zotero.Utilities.formatDate(Zotero.Utilities.strToDate(article.publicationDate.date.text().toString()));
+			if(article.byline.length()) {
+				// strip only a leading whole-word "By"; an unanchored match would
+				// also eat "by" inside a name such as "Abby"
+				var byline = Zotero.Utilities.cleanString(article.byline.text().toString().replace(/^\s*By\b[:\s]*/i, ""));
+				var authors = byline.split(/ (?:\&|and) /i);
+				for each(var author in authors) {
+					newItem.creators.push(Zotero.Utilities.cleanAuthor(author, "author"));
+				}
+			}
+			newItem.section = article.sectionName.text().toString();
+			newItem.edition = article.edition.text().toString();
+			
+			if(article.pages.length()) {
+				// comma-separated list of every page element
+				newItem.pages = "";
+				for each(var page in article.pages.page) {
+					newItem.pages += ","+page.text().toString();
+				}
+				newItem.pages = newItem.pages.substr(1);
+			}
+			
+			// the ISSN, when present, is embedded in the volume text
+			var m = article.volume.text().toString().match(/ISSN[:\s]*([\-0-9]{8,9})/i);
+			if(m) newItem.ISSN = m[1];
+			
+			newItem.complete();
+		}
+		
+		Zotero.done();
+	});
+		
+	Zotero.wait();
+}');
+
 REPLACE INTO translators VALUES ('e07e9b8c-0e98-4915-bb5a-32a08cb2f365', '1.0.0b2.r2', '', '2006-10-02 17:00:00', 1, 100, 8, 'Open WorldCat', 'Simon Kornblith', 'http://partneraccess.oclc.org/',
 'function detectSearch(item) {
 	if(item.itemType == "book" || item.itemType == "bookSection") {