-Fixes some stuff in the SciELO translator.
This commit is contained in:
parent
abbc8f0397
commit
c5e3d0c359
130
scrapers.sql
130
scrapers.sql
|
@ -22,7 +22,7 @@
|
|||
|
||||
|
||||
-- Set the following timestamp to the most recent scraper update date
|
||||
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-05-14 14:30:00'));
|
||||
REPLACE INTO version VALUES ('repository', STRFTIME('%s', '2008-05-14 15:00:00'));
|
||||
|
||||
REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b4.r1', '', '2008-03-21 20:00:00', '1', '100', '4', 'Amazon.com', 'Sean Takats and Michael Berkowitz', '^https?://(?:www\.)?amazon',
|
||||
'function detectWeb(doc, url) {
|
||||
|
@ -4987,24 +4987,36 @@ REPLACE INTO translators VALUES ('636c8ea6-2af7-4488-8ccd-ea280e4a7a98', '1.0.0b
|
|||
Zotero.wait();
|
||||
}');
|
||||
|
||||
REPLACE INTO translators VALUES ('3eabecf9-663a-4774-a3e6-0790d2732eed', '1.0.0b4.r5', '', '2008-04-28 17:50:00', '0', '100', '4', 'SciELO', 'Michael Berkowitz', 'http://www.scielo.(org|br)/',
|
||||
REPLACE INTO translators VALUES ('3eabecf9-663a-4774-a3e6-0790d2732eed', '1.0.0b4.r5', '', '2008-05-14 15:00:00', '1', '100', '4', 'SciELO', 'Michael Berkowitz', 'http://(www.)?scielo.(org|br)/',
|
||||
'function detectWeb(doc, url) {
|
||||
if (url.indexOf("wxis.exe") != -1) {
|
||||
if (doc.evaluate(''//*[@class="isoref"]'', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
return "multiple";
|
||||
}
|
||||
} else if (url.indexOf("issuetoc") != -1) {
|
||||
return "multiple"
|
||||
} else if (url.indexOf("&pid=") != -1) {
|
||||
return "journalArticle";
|
||||
} else {
|
||||
Zotero.debug("ok");
|
||||
}
|
||||
}',
|
||||
'function doWeb(doc, url) {
|
||||
'function makeURL(host, str) {
|
||||
return ''http://'' + host + ''/scieloOrg/php/articleXML.php?pid='' + str.match(/pid=([^&]+)/)[1] + ''&lang=en'';
|
||||
}
|
||||
|
||||
function doWeb(doc, url) {
|
||||
var host = doc.location.host;
|
||||
var arts = new Array();
|
||||
if (detectWeb(doc, url) == "multiple") {
|
||||
var items = new Object();
|
||||
var titlepath = ''//font[@class="isoref"]/font[@class="negrito"]'';
|
||||
var linkpath = ''//font[@class="isoref"]/a[@class="isoref"]'';
|
||||
if (url.indexOf(".exe") != -1) {
|
||||
var titlepath = ''//font[@class="isoref"]/font[@class="negrito"]'';
|
||||
var linkpath = ''//font[@class="isoref"]/a[@class="isoref"]'';
|
||||
} else {
|
||||
var titlepath = ''//font[@class="normal"]/b/b[1]'';
|
||||
var linkpath = ''//tr/td/div/a[1]'';
|
||||
}
|
||||
var titles = doc.evaluate(titlepath, doc, null, XPathResult.ANY_TYPE, null);
|
||||
var links = doc.evaluate(linkpath, doc, null, XPathResult.ANY_TYPE, null);
|
||||
var next_title;
|
||||
|
@ -5014,69 +5026,59 @@ REPLACE INTO translators VALUES ('3eabecf9-663a-4774-a3e6-0790d2732eed', '1.0.0b
|
|||
}
|
||||
items = Zotero.selectItems(items);
|
||||
for (var i in items) {
|
||||
arts.push(i);
|
||||
arts.push(makeURL(host, i));
|
||||
}
|
||||
} else {
|
||||
arts = [url];
|
||||
arts = [makeURL(host, url)];
|
||||
}
|
||||
for each (var url in arts) {
|
||||
Zotero.debug(url);
|
||||
Zotero.Utilities.HTTP.doGet(url, function(text) {
|
||||
var link = text.match(/\"([^"]+articleXML[^"]+)\"/)[1];
|
||||
Zotero.debug(link);
|
||||
|
||||
Zotero.Utilities.HTTP.doGet(link, function(text) {
|
||||
var item = new Zotero.Item("journalArticle");
|
||||
|
||||
text = text.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, "").replace(/<self-uri.*\/self\-uri>/g, "");
|
||||
var journal = text.split("<journal-meta>")[1].split("</journal-meta>")[0];
|
||||
journal = "<journal>" + journal + "</journal>";
|
||||
journal = journal.replace(/\-([a-z])/g, "$1");
|
||||
var xml2 = new XML(journal);
|
||||
var art = text.split("<article-meta>")[1].split("</article-meta>")[0];
|
||||
art = "<article>" + art + "</article>";
|
||||
art = art.replace(/\-([a-z])/g, "$1");
|
||||
var xml3 = new XML(art);
|
||||
|
||||
item.publicationTitle = xml2..journaltitle.text().toString();
|
||||
item.journalAbbreviation = xml2..abbrevjournaltitle.text().toString();
|
||||
item.ISSN = xml2..issn.text().toString();
|
||||
item.publisher = xml2..publisher..publishername.text().toString();
|
||||
|
||||
item.title = xml3..titlegroup..articletitle.text().toString();
|
||||
for (var i = 0 ; i < xml3..contribgroup..contrib.length() ; i++) {
|
||||
var name = xml3..contribgroup..contrib[i]..name;
|
||||
item.creators.push({firstName:name..givennames.text().toString(), lastName:name..surname.text().toString(), creatorType:"author"});
|
||||
}
|
||||
|
||||
var date = xml3..pubdate[0];
|
||||
var day = date..day.text().toString();
|
||||
var month = date..month.text().toString();
|
||||
var year = date..year.text().toString();
|
||||
|
||||
date = year;
|
||||
if (month != "00") {
|
||||
date = month + "/" + date;
|
||||
}
|
||||
if (day != "00") {
|
||||
date = day + "/" + date;
|
||||
}
|
||||
item.date = date;
|
||||
item.volume = xml3..volume.text().toString();
|
||||
item.pages = xml3..fpage.text().toString() + "-" + xml3..lpage.text().toString();
|
||||
|
||||
for (var i = 0 ; i < xml3..kwdgroup..kwd.length() ; i++) {
|
||||
item.tags.push(xml3..kwdgroup..kwd[i].text().toString());
|
||||
}
|
||||
|
||||
item.attachments = [
|
||||
{url:url, title:"SciELO Snapshot", mimeType:"text/html"}
|
||||
];
|
||||
Zotero.Utilities.HTTP.doGet(arts, function(text) {
|
||||
var item = new Zotero.Item("journalArticle");
|
||||
text = text.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, "").replace(/<self-uri.*\/self\-uri>/g, "");
|
||||
var journal = text.split("<journal-meta>")[1].split("</journal-meta>")[0];
|
||||
journal = "<journal>" + journal + "</journal>";
|
||||
journal = journal.replace(/\-([a-z])/g, "$1");
|
||||
var xml2 = new XML(journal);
|
||||
var art = text.split("<article-meta>")[1].split("</article-meta>")[0];
|
||||
art = "<article>" + art + "</article>";
|
||||
art = art.replace(/\-([a-z])/g, "$1");
|
||||
var xml3 = new XML(art);
|
||||
|
||||
item.complete();
|
||||
});
|
||||
});
|
||||
}
|
||||
item.publicationTitle = xml2..journaltitle.text().toString();
|
||||
item.journalAbbreviation = xml2..abbrevjournaltitle.text().toString();
|
||||
item.ISSN = xml2..issn.text().toString();
|
||||
item.publisher = xml2..publisher..publishername.text().toString();
|
||||
item.title = xml3..titlegroup..articletitle.text().toString();
|
||||
for (var i = 0 ; i < xml3..contribgroup..contrib.length() ; i++) {
|
||||
var name = xml3..contribgroup..contrib[i]..name;
|
||||
item.creators.push({firstName:name..givennames.text().toString(), lastName:name..surname.text().toString(), creatorType:"author"});
|
||||
}
|
||||
|
||||
var date = xml3..pubdate[0];
|
||||
var day = date..day.text().toString();
|
||||
var month = date..month.text().toString();
|
||||
var year = date..year.text().toString();
|
||||
|
||||
date = year;
|
||||
if (month != "00") {
|
||||
date = month + "/" + date;
|
||||
}
|
||||
if (day != "00") {
|
||||
date = day + "/" + date;
|
||||
}
|
||||
item.date = date;
|
||||
item.volume = xml3..volume.text().toString();
|
||||
item.pages = xml3..fpage.text().toString() + "-" + xml3..lpage.text().toString();
|
||||
|
||||
for (var i = 0 ; i < xml3..kwdgroup..kwd.length() ; i++) {
|
||||
item.tags.push(xml3..kwdgroup..kwd[i].text().toString());
|
||||
}
|
||||
|
||||
item.attachments = [
|
||||
{url:url, title:"SciELO Snapshot", mimeType:"text/html"}
|
||||
];
|
||||
|
||||
item.complete();
|
||||
});
|
||||
}');
|
||||
|
||||
REPLACE INTO translators VALUES ('0a84a653-79ea-4c6a-8a68-da933e3b504a', '1.0.0b4.r5', '', '2008-03-28 16:30:00', '0', '100', '4', 'Alexander Street Press', 'John West and Michael Berkowitz', 'http://(?:www\.)alexanderstreet',
|
||||
|
|
Loading…
Reference in New Issue
Block a user