-Adds two more of Adam's translators.
This commit is contained in:
parent
e7c3de1b1c
commit
bae0a37d39
239
scrapers.sql
239
scrapers.sql
|
@ -1091,6 +1091,245 @@ REPLACE INTO translators VALUES ('88915634-1af6-c134-0171-56fd198235ed', '1.0.0b
|
|||
Zotero.wait();
|
||||
}');
|
||||
|
||||
REPLACE INTO translators VALUES ('a7c8b759-6f8a-4875-9d6e-cc0a99fe8f43', '1.0.0b4.r5', '', '2008-06-20 09:39:46', '0', '100', '4', 'Canadian Letters and Images', 'Adam Crymble', 'http://(www.)?canadianletters.ca/',
|
||||
'function detectWeb(doc, url) {
|
||||
// Classifies the current page by looking for a marker inside doc.location.href.
// The url parameter is not read; only doc.location.href is inspected.
// Returns "multiple", "document" or "artwork"; returns undefined when no marker is found.
// NOTE(review): match() is given a string, which is treated as a regular expression,
// so the dot in "letters.php" matches any character.
if (doc.location.href.match("results")) {
|
||||
return "multiple";
|
||||
} else if (doc.location.href.match("letters.php")) {
|
||||
return "document";
|
||||
} else if (doc.location.href.match("template")) {
|
||||
return "artwork";
|
||||
}
|
||||
|
||||
}',
|
||||
'//Translator for Canadian Letters and Images. Code by Adam Crymble
|
||||
|
||||
|
||||
// Builds and saves one Zotero item from a Canadian Letters and Images page.
//
// doc - DOM document of the page being scraped
// url - address of that page; only forwarded to detectWeb
//
// detectWeb decides what is created: "document" pages become letter items and
// "artwork" pages become artwork items. Any other result is logged and skipped,
// so an unexpected page no longer aborts the whole batch with a TypeError.
function scrape(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == "x") return namespace; else return null;
	} : null;

	// Runs an XPath query against the whole document.
	function query(xpath) {
		return doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
	}

	// Returns the first node matched by xpath, or null when nothing matches.
	function firstNode(xpath) {
		return query(xpath).iterateNext();
	}

	var mediaType = detectWeb(doc, url);
	var newItem;

	if (mediaType == "document") {
		newItem = new Zotero.Item("letter");

		//title: first h3 on the page, falling back to the document title
		var heading = firstNode("//h3");
		newItem.title = heading ? heading.textContent : doc.title;

		//letter, diary, memoir, personal item
		var category = firstNode("//div[@id=\"collectionCategory_letters\"]");
		if (category) {
			newItem.type = category.textContent;
		}

		//gets date, to and from
		// Labels and values live in two parallel lists of divs. They are walked
		// in lock step and the walk stops as soon as either list runs out, so a
		// label without a matching value is ignored instead of throwing.
		var labels = query("//div[@class=\"letterInfo_label\"]");
		var values = query("//div[@class=\"letterInfo_title\"]");
		var labelNode;
		var valueNode;
		while ((labelNode = labels.iterateNext()) && (valueNode = values.iterateNext())) {
			// Labels are compared with all whitespace removed, values are only trimmed.
			var fieldTitle = labelNode.textContent.replace(/\s+/g, "");
			var fieldValue = valueNode.textContent.replace(/^\s*|\s*$/g, "");

			if (fieldTitle == "To:") {
				newItem.abstractNote = ("To: " + fieldValue);
			} else if (fieldTitle == "From:") {
				newItem.creators.push(Zotero.Utilities.cleanAuthor(fieldValue, "author"));
			} else if (fieldTitle == "Date:") {
				newItem.date = fieldValue;
			}
		}
	} else if (mediaType == "artwork") {
		newItem = new Zotero.Item("artwork");

		//title: text of the picture display block, falling back to the document title
		var picture = firstNode("//div[@class=\"pictureDisplay\"]");
		newItem.title = picture ? picture.textContent : doc.title;
	} else {
		// Neither a letter page nor an image page: there is nothing to save.
		Zotero.debug("Canadian Letters and Images: unsupported page " + doc.location.href);
		return;
	}

	newItem.url = doc.location.href;
	newItem.complete();
}
|
||||
|
||||
// Entry point that saves items from the current page.
// When detectWeb reports "multiple", the search result links are collected, offered to
// the user through Zotero.selectItems, and every chosen address is scraped.
// Otherwise the current url alone is scraped.
function doWeb(doc, url) {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
// Resolver that maps the prefix "x" to the namespace of the document; null when the document has none.
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
// Addresses of the pages that will be handed to scrape.
var articles = new Array();
|
||||
|
||||
if (detectWeb(doc, url) == "multiple") {
|
||||
// Maps link address to link text for every search result.
var items = new Object();
|
||||
|
||||
var titles = doc.evaluate(''//div[@class="searchResultsDisplay"]/div/a'', doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
|
||||
var next_title;
|
||||
while (next_title = titles.iterateNext()) {
|
||||
items[next_title.href] = next_title.textContent;
|
||||
}
|
||||
// NOTE(review): presumably returns only the entries the user picked - confirm against the Zotero API.
items = Zotero.selectItems(items);
|
||||
// The keys of items are the addresses, so the loop variable is pushed directly.
for (var i in items) {
|
||||
articles.push(i);
|
||||
}
|
||||
} else {
|
||||
articles = [url];
|
||||
}
|
||||
// scrape is passed as the per-document callback; the last argument calls Zotero.done().
Zotero.Utilities.processDocuments(articles, scrape, function() {Zotero.done();});
|
||||
Zotero.wait();
|
||||
}');
|
||||
|
||||
REPLACE INTO translators VALUES ('3f44a651-8b6b-4591-8ca4-4bfb943a13f4', '1.0.0b4.r5', '', '2008-06-20 10:02:00', '0', '100', '4', 'Edutopia', 'Adam Crymble', 'http://www.edutopia.org',
|
||||
'function detectWeb(doc, url) {
|
||||
// Classifies an Edutopia page. The url parameter is not read.
// Returns "multiple" when doc.location.href contains "search".
// Returns "newspaperArticle" when the document title does not contain "blog" and the page has an h1.
// Returns undefined in every other case.
|
||||
// Flag set to 1 when the document title contains "blog".
var blog1 = 0;
|
||||
|
||||
if (doc.title.match("blog")) {
|
||||
blog1 = 1;
|
||||
}
|
||||
|
||||
if (doc.location.href.match("search")) {
|
||||
return "multiple";
|
||||
} else if (blog1 == 0 && doc.evaluate(''//h1'', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
return "newspaperArticle";
|
||||
}
|
||||
}',
|
||||
'//Edutopia.org translator. Code by Adam Crymble
|
||||
|
||||
// Copies one collected meta tag value onto the item.
// newItem     - item object that receives the value
// metaTags    - object mapping meta tag names to their content
// field       - key to look up in metaTags
// zoteroField - property of newItem to assign
// Nothing is assigned when metaTags[field] is missing or otherwise falsy.
function associateMeta(newItem, metaTags, field, zoteroField) {
|
||||
if(metaTags[field]) {
|
||||
newItem[zoteroField] = metaTags[field];
|
||||
}
|
||||
}
|
||||
|
||||
// Builds and saves one newspaperArticle item from an Edutopia page.
//
// doc - DOM document of the page being scraped
// url - address of that page; not read, the item url comes from doc.location.href
//
// Collected fields: title (document title up to the first "|"), author,
// abstract, date, tags from the keywords meta tag, publication title and url.
function scrape(doc, url) {
	var namespace = doc.documentElement.namespaceURI;
	var nsResolver = namespace ? function(prefix) {
		if (prefix == "x") return namespace; else return null;
	} : null;

	// Returns the text of the first node matched by xpath, or null when nothing matches.
	function firstText(xpath) {
		var node = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
		return node ? node.textContent : null;
	}

	var newItem = new Zotero.Item("newspaperArticle");

	//title
	var title1 = doc.title.split("|");
	newItem.title = title1[0];

	//author: article pages, poll pages and blog pages keep the byline in different places;
	// the first location that exists wins
	var authorXPaths = [
		"//div[@id=\"article\"]/h4/a",
		"//div[@id=\"pollpage\"]/p/a",
		"//div[@class=\"blog\"]/h4"
	];
	var author = null;
	for (var a = 0; a < authorXPaths.length && author === null; a++) {
		author = firstText(authorXPaths[a]);
	}

	if (author !== null) {
		// Drop a leading "By " so only the name reaches cleanAuthor.
		if (author.toLowerCase().match(/^by /)) {
			author = author.substr(3);
		}
		Zotero.debug(author);
		newItem.creators.push(Zotero.Utilities.cleanAuthor(author, "author"));
	}

	//abstract
	var dek = firstText("//div[@class=\"dek\"]/h3");
	if (dek !== null) {
		newItem.abstractNote = dek;
	}

	//date
	// The span is searched anywhere in the document. A path starting with a single
	// slash would only look at the root element, which is never a span.
	var blogDate = firstText("//span[@class=\"blog_date\"]");
	if (blogDate !== null) {
		newItem.date = blogDate;
	}

	// Collect meta tags by name. Tags without a name or without content
	// carry nothing usable here and are skipped.
	var metaTags = new Object();
	var metaTagHTML = doc.getElementsByTagName("meta");
	for (var i = 0; i < metaTagHTML.length; i++) {
		var metaName = metaTagHTML[i].getAttribute("name");
		var metaContent = metaTagHTML[i].getAttribute("content");
		if (metaName && metaContent) {
			metaTags[metaName] = Zotero.Utilities.cleanTags(metaContent);
		}
	}

	//tags: the keywords meta tag is a list separated by a comma and a space
	if (metaTags["keywords"]) {
		var tagsContent = metaTags["keywords"].split(", ");
		for (var i = 0; i < tagsContent.length; i++) {
			newItem.tags[i] = tagsContent[i];
		}
	}

	// A description meta tag, when present, replaces the abstract taken from the page body.
	associateMeta (newItem, metaTags, "description", "abstractNote");

	// publicationTitle is the Zotero field for the name of the publication.
	newItem.publicationTitle = "Edutopia.org";
	newItem.url = doc.location.href;

	newItem.complete();
}
|
||||
|
||||
// Entry point that saves items from the current page.
// When detectWeb reports "multiple", the result links are collected, offered to the
// user through Zotero.selectItems, and every chosen address is scraped.
// Otherwise the current url alone is scraped.
function doWeb(doc, url) {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
// Resolver that maps the prefix "x" to the namespace of the document; null when the document has none.
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
// Addresses of the pages that will be handed to scrape.
var articles = new Array();
|
||||
|
||||
if (detectWeb(doc, url) == "multiple") {
|
||||
// Maps link address to link text for every result.
var items = new Object();
|
||||
|
||||
var titles = doc.evaluate(''//dt[@class="title"]/a'', doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
|
||||
var next_title;
|
||||
while (next_title = titles.iterateNext()) {
|
||||
items[next_title.href] = next_title.textContent;
|
||||
}
|
||||
// NOTE(review): presumably returns only the entries the user picked - confirm against the Zotero API.
items = Zotero.selectItems(items);
|
||||
// The keys of items are the addresses, so the loop variable is pushed directly.
for (var i in items) {
|
||||
articles.push(i);
|
||||
}
|
||||
} else {
|
||||
articles = [url];
|
||||
}
|
||||
// scrape is passed as the per-document callback; the last argument calls Zotero.done().
Zotero.Utilities.processDocuments(articles, scrape, function() {Zotero.done();});
|
||||
Zotero.wait();
|
||||
}');
|
||||
|
||||
REPLACE INTO translators VALUES ('18bc329c-51af-497e-a7cf-aa572fae363d', '1.0.0b4.r5', '', '2008-06-18 10:26:16', '0', '100', '4', 'Archives Canada', 'Adam Crymble', 'http://(www.)?archivescanada.ca',
|
||||
'function detectWeb (doc, url) {
|
||||
if (doc.location.href.match("RouteRqst")) {
|
||||
|
|
Loading…
Reference in New Issue
Block a user