Amazon scraper now supports international Amazon sites and retrieves data from Amazon's API
This commit is contained in:
parent
a5594cb4dc
commit
6ff2168729
264
scrapers.sql
264
scrapers.sql
|
@ -1,4 +1,4 @@
|
|||
-- 104
|
||||
-- 105
|
||||
|
||||
-- ***** BEGIN LICENSE BLOCK *****
|
||||
--
|
||||
|
@ -22,140 +22,194 @@
|
|||
|
||||
|
||||
-- Set the following timestamp to the most recent scraper update date
|
||||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-11-20 23:10:00'));
|
||||
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-11-21 22:30:00'));
|
||||
|
||||
REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-10-02 17:00:00', 1, 100, 4, 'Amazon.com', 'Simon Kornblith', '^http://www\.amazon\.com/',
|
||||
REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-11-21 22:30:00', 1, 100, 12, 'Amazon', 'Sean Takats', '^http://(?:www\.)amazon',
|
||||
'function detectWeb(doc, url) {
|
||||
var searchRe = new RegExp(''^http://(?:www\.)?amazon\.com/(gp/search/|exec/obidos/search-handle-url/|s/)'');
|
||||
|
||||
var suffixRe = new RegExp("http://(?:www\.)amazon\.([^/]+)/");
|
||||
var suffixMatch = suffixRe.exec(url);
|
||||
var suffix = suffixMatch[1];
|
||||
var searchRe = new RegExp(''^http://(?:www\.)?amazon\.'' + suffix + ''/(gp/search/|exec/obidos/search-handle-url/|s/)'');
|
||||
if(searchRe.test(doc.location.href)) {
|
||||
return "multiple";
|
||||
} else {
|
||||
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li'';
|
||||
var xpath = ''//input[@name="ASIN"]'';
|
||||
if(doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
|
||||
return "book";
|
||||
elmt = doc.evaluate(''//input[@name="storeID"]'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
|
||||
var storeID = Zotero.Utilities.getNodeString(doc, elmt, ''./@value'', nsResolver);
|
||||
Zotero.Utilities.debug("store id: " + storeID);
|
||||
if (storeID=="books"){
|
||||
return "book";
|
||||
}
|
||||
else if (storeID=="music"){
|
||||
return "audioRecording";
|
||||
}
|
||||
else if (storeID=="dvd"|storeID=="video"){
|
||||
return "videoRecording";
|
||||
}
|
||||
else {
|
||||
return "book";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
',
|
||||
'function scrape(doc) {
|
||||
'function doWeb(doc, url) {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
var newItem = new Zotero.Item("book");
|
||||
|
||||
// Retrieve authors
|
||||
try {
|
||||
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a/text()[1]'';
|
||||
var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
var elmt;
|
||||
while(elmt = elmts.iterateNext()) {
|
||||
newItem.creators.push(Zotero.Utilities.cleanAuthor(elmt.nodeValue, "author"));
|
||||
}
|
||||
} catch(ex) {Zotero.Utilities.debug(ex);}
|
||||
|
||||
// Retrieve data from "Product Details" box
|
||||
var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li'';
|
||||
var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
var elmt;
|
||||
|
||||
newItem.extra = "";
|
||||
while(elmt = elmts.iterateNext()) {
|
||||
try {
|
||||
var attribute = Zotero.Utilities.cleanString(doc.evaluate(''./B[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue);
|
||||
var value = Zotero.Utilities.getNodeString(doc, elmt, ''./descendant-or-self::*[name() != "B"]/text()'', nsResolver);
|
||||
if(value) {
|
||||
value = Zotero.Utilities.cleanString(value);
|
||||
|
||||
if(attribute == "Publisher:") {
|
||||
if(value.lastIndexOf("(") != -1) {
|
||||
newItem.date = value.substring(value.lastIndexOf("(")+1, value.length-1);
|
||||
|
||||
value = value.substring(0, value.lastIndexOf("(")-1);
|
||||
}
|
||||
if(value.lastIndexOf(";") != -1) {
|
||||
newItem.edition = value.substring(value.lastIndexOf(";")+2, value.length);
|
||||
|
||||
value = value.substring(0, value.lastIndexOf(";"));
|
||||
}
|
||||
newItem.publisher = value;
|
||||
} else if(attribute == "ISBN:") {
|
||||
newItem.ISBN = value;
|
||||
} else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") {
|
||||
newItem.pages = value.substring(0, value.indexOf(" "));
|
||||
} else if(attribute != "Average Customer Review:") {
|
||||
if(attribute == "In-Print Editions:") {
|
||||
value = value.replace(" | All Editions", "");
|
||||
} else {
|
||||
value = value.replace(/\([^)]*\)/g, "");
|
||||
}
|
||||
|
||||
newItem.extra += attribute+" "+value+"\n";
|
||||
}
|
||||
}
|
||||
} catch(ex) {}
|
||||
}
|
||||
|
||||
if(newItem.extra) {
|
||||
newItem.extra = newItem.extra.substr(0, newItem.extra.length-1);
|
||||
}
|
||||
|
||||
newItem.attachments.push({title:"Amazon.com Product Page", document:doc});
|
||||
|
||||
var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]/text()[1]'';
|
||||
var title = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
|
||||
title = Zotero.Utilities.cleanString(title);
|
||||
if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
|
||||
title = title.substring(0, title.lastIndexOf("(")-1);
|
||||
}
|
||||
newItem.title = title;
|
||||
|
||||
newItem.complete();
|
||||
}
|
||||
|
||||
function doWeb(doc, url) {
|
||||
var searchRe = new RegExp(''^http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/|s/)'');
|
||||
var m = searchRe.exec(doc.location.href)
|
||||
if(m) {
|
||||
var namespace = doc.documentElement.namespaceURI;
|
||||
var nsResolver = namespace ? function(prefix) {
|
||||
if (prefix == ''x'') return namespace; else return null;
|
||||
} : null;
|
||||
|
||||
// Why can''t amazon use the same stylesheets
|
||||
var xpath;
|
||||
if(m == "exec/obidos/search-handle-url/") {
|
||||
xpath = ''//table[@cellpadding="3"]'';
|
||||
} else {
|
||||
xpath = ''//table[@class="searchresults"]'';
|
||||
}
|
||||
|
||||
var searchresults = Zotero.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
|
||||
var items = Zotero.Utilities.getItemArray(doc, searchresults, ''^http://www\.amazon\.com/(gp/product/|exec/obidos/tg/detail/|[^/]+/dp/)'', ''^(Buy new|Hardcover|Paperback|Digital)$'');
|
||||
items = Zotero.selectItems(items);
|
||||
var suffixRe = new RegExp("http://(?:www\.)amazon\.([^/]+)/");
|
||||
var suffixMatch = suffixRe.exec(url);
|
||||
var suffix = suffixMatch[1];
|
||||
|
||||
var searchRe = new RegExp(''^http://www\.amazon\.'' + suffix + ''/(gp/search/|exec/obidos/search-handle-url/|s/)'');
|
||||
var m = searchRe.exec(doc.location.href);
|
||||
var uris = new Array();
|
||||
if (suffix == "co.jp"){
|
||||
suffix = "jp";
|
||||
}
|
||||
if(m) {
|
||||
var xpath = ''//a/span[@class="srTitle"]'';
|
||||
var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
|
||||
var elmt = elmts.iterateNext();
|
||||
var asins = new Array();
|
||||
var availableItems = new Array();
|
||||
var i = 0;
|
||||
var asinRe = new RegExp(''/(dp|product)/([^/]+)/'');
|
||||
|
||||
do {
|
||||
var link = Zotero.Utilities.getNodeString(doc, elmt, ''../@href'', nsResolver);
|
||||
var searchTitle = Zotero.Utilities.getNodeString(doc, elmt, ''./text()'', nsResolver);
|
||||
availableItems[i] = searchTitle;
|
||||
var asinMatch = asinRe.exec(link);
|
||||
asins[i] = asinMatch[2];
|
||||
Zotero.Utilities.debug(searchTitle + " @ " + asins[i]);
|
||||
i++;
|
||||
} while (elmt = elmts.iterateNext());
|
||||
var items = Zotero.selectItems(availableItems);
|
||||
|
||||
if(!items) {
|
||||
return true;
|
||||
}
|
||||
|
||||
var uris = new Array();
|
||||
for(var i in items) {
|
||||
uris.push(i);
|
||||
uris.push("http://ecs.amazonaws." + suffix + "/onca/xml?Service=AWSECommerceService&Version=2006-06-28&Operation=ItemLookup&SubscriptionId=0H174V5J5R5BE02YQN02&ItemId=" + asins[i] + "&ResponseGroup=ItemAttributes");
|
||||
}
|
||||
|
||||
Zotero.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
|
||||
function() { Zotero.done(); }, null);
|
||||
|
||||
Zotero.wait();
|
||||
} else {
|
||||
scrape(doc);
|
||||
var elmts = doc.evaluate(''//input[@name = "ASIN"]'', doc,
|
||||
nsResolver, XPathResult.ANY_TYPE, null);
|
||||
var elmt;
|
||||
while(elmt = elmts.iterateNext()) {
|
||||
var asin = Zotero.Utilities.getNodeString(doc, elmt, ''./@value'', nsResolver);
|
||||
}
|
||||
uris.push("http://ecs.amazonaws." + suffix + "/onca/xml?Service=AWSECommerceService&Version=2006-06-28&Operation=ItemLookup&SubscriptionId=0H174V5J5R5BE02YQN02&ItemId=" + asin + "&ResponseGroup=ItemAttributes");
|
||||
}
|
||||
|
||||
Zotero.Utilities.HTTP.doGet(uris, function(text) {
|
||||
text = text.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, "");
|
||||
var texts = text.split("<Items>");
|
||||
texts = texts[1].split("</ItemLookupResponse>");
|
||||
text = "<Items>" + texts[0];
|
||||
var xml = new XML(text);
|
||||
|
||||
var publisher = "";
|
||||
if (xml..Publisher.length()){
|
||||
publisher = Zotero.Utilities.cleanString(xml..Publisher[0].text().toString());
|
||||
}
|
||||
|
||||
var binding = "";
|
||||
if (xml..Binding.length()){
|
||||
binding = Zotero.Utilities.cleanString(xml..Binding[0].text().toString());
|
||||
}
|
||||
|
||||
var productGroup = "";
|
||||
if (xml..ProductGroup.length()){
|
||||
productGroup = Zotero.Utilities.cleanString(xml..ProductGroup[0].text().toString());
|
||||
}
|
||||
|
||||
if (productGroup=="Book") {
|
||||
var newItem = new Zotero.Item("book");
|
||||
newItem.publisher = publisher;
|
||||
}
|
||||
else if (productGroup == "Music") {
|
||||
var newItem = new Zotero.Item("audioRecording");
|
||||
newItem.label = publisher;
|
||||
newItem.audioRecordingType = binding;
|
||||
for(var i=0; i<xml..Artist.length(); i++) {
|
||||
newItem.creators.push(Zotero.Utilities.cleanAuthor(xml..Artist[i].text().toString(), "performer"));
|
||||
}
|
||||
}
|
||||
else if (productGroup == "DVD" | productGroup == "Video") {
|
||||
var newItem = new Zotero.Item("videoRecording");
|
||||
newItem.studio = publisher;
|
||||
newItem.videoRecordingType = binding;
|
||||
for(var i=0; i<xml..Actor.length(); i++) {
|
||||
newItem.creators.push(Zotero.Utilities.cleanAuthor(xml..Actor[i].text().toString(), "castMember"));
|
||||
}
|
||||
for(var i=0; i<xml..Director.length(); i++) {
|
||||
newItem.creators.push(Zotero.Utilities.cleanAuthor(xml..Director[i].text().toString(), "director"));
|
||||
}
|
||||
}
|
||||
else{
|
||||
var newItem = new Zotero.Item("book");
|
||||
newItem.publisher = publisher;
|
||||
}
|
||||
|
||||
if(xml..RunningTime.length()){
|
||||
newItem.runningTime = Zotero.Utilities.cleanString(xml..RunningTime[0].text().toString());
|
||||
}
|
||||
|
||||
// Retrieve authors and other creators
|
||||
for(var i=0; i<xml..Author.length(); i++) {
|
||||
newItem.creators.push(Zotero.Utilities.cleanAuthor(xml..Author[i].text().toString()));
|
||||
}
|
||||
if (newItem.creators.length == 0){
|
||||
for(var i=0; i<xml..Creator.length(); i++) {
|
||||
newItem.creators.push(Zotero.Utilities.cleanAuthor(xml..Creator[i].text().toString()));
|
||||
}
|
||||
}
|
||||
|
||||
if (xml..PublicationDate.length()){
|
||||
newItem.date = Zotero.Utilities.cleanString(xml..PublicationDate[0].text().toString());
|
||||
} else if (xml..ReleaseDate.length()){
|
||||
newItem.date = Zotero.Utilities.cleanString(xml..ReleaseDate[0].text().toString());
|
||||
}
|
||||
if (xml..Edition.length()){
|
||||
newItem.edition = Zotero.Utilities.cleanString(xml..Edition[0].text().toString());
|
||||
}
|
||||
if (xml..ISBN.length()){
|
||||
newItem.ISBN = Zotero.Utilities.cleanString(xml..ISBN[0].text().toString());
|
||||
}
|
||||
if (xml..NumberOfPages.length()){
|
||||
newItem.pages = Zotero.Utilities.cleanString(xml..NumberOfPages[0].text().toString());
|
||||
}
|
||||
var title = Zotero.Utilities.cleanString(xml..Title[0].text().toString());
|
||||
if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
|
||||
title = title.substring(0, title.lastIndexOf("(")-1);
|
||||
}
|
||||
if (xml..ASIN.length()){
|
||||
newItem.url = "http://www.amazon." + suffix + "/dp/" + Zotero.Utilities.cleanString(xml..ASIN[0].text().toString());
|
||||
}
|
||||
|
||||
if (xml..OriginalReleaseDate.length()){
|
||||
newItem.extra = newItem.pages = Zotero.Utilities.cleanString(xml..OriginalReleaseDate[0].text().toString());
|
||||
}
|
||||
|
||||
Zotero.Utilities.debug("item title: " + title);
|
||||
newItem.title = title;
|
||||
newItem.complete();
|
||||
}, function() {Zotero.done();}, null);
|
||||
Zotero.wait();
|
||||
}');
|
||||
|
||||
REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-10-02 17:00:00', 1, 100, 4, 'WorldCat', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/',
|
||||
|
|
Loading…
Reference in New Issue
Block a user