From c5ec016ed9a1fe9e1c8ac87d9c0100927f3adf56 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Tue, 12 Dec 2006 00:28:49 +0000 Subject: [PATCH] - closes #327, scrapers should either take snapshots or use URL field - closes #351, scrapers with PDF downloads should use downloadAssociatedFiles instead of automaticSnapshots there are some problems with snapshot titles. see bug #436. --- chrome/content/zotero/xpcom/translate.js | 48 +++++--- chrome/content/zotero/xpcom/utilities.js | 2 +- defaults/preferences/zotero.js | 2 +- scrapers.sql | 144 ++++++++++++----------- 4 files changed, 109 insertions(+), 87 deletions(-) diff --git a/chrome/content/zotero/xpcom/translate.js b/chrome/content/zotero/xpcom/translate.js index b0f6e4804..7aee7d351 100644 --- a/chrome/content/zotero/xpcom/translate.js +++ b/chrome/content/zotero/xpcom/translate.js @@ -958,7 +958,7 @@ Zotero.Translate.prototype._generateErrorString = function(error) { // TODO: Currently using automaticSnapshots pref for everything // Eventually downloadAssociatedFiles may be a separate pref // for PDFs and other large files - //+ "\nextensions.zotero.downloadAssociatedFiles => "+Zotero.Prefs.get("downloadAssociatedFiles"); + + "\nextensions.zotero.downloadAssociatedFiles => "+Zotero.Prefs.get("downloadAssociatedFiles") /* no semicolon here: the automaticSnapshots line below continues this expression */ + "\nextensions.zotero.automaticSnapshots => "+Zotero.Prefs.get("automaticSnapshots"); return errorString.substr(1); } @@ -1216,20 +1216,44 @@ Zotero.Translate.prototype._itemDone = function(item, attachedTo) { } // handle attachments - if(item.attachments) { + if(item.attachments && Zotero.Prefs.get("automaticSnapshots")) { + Zotero.debug("HANDLING ATTACHMENTS"); for each(var attachment in item.attachments) { if(this.type == "web") { if(!attachment.url && !attachment.document) { Zotero.debug("not adding attachment: no URL specified"); - } else if(attachment.downloadable && this._downloadAssociatedFiles) { - if(attachment.document) { - 
Zotero.Attachments.importFromDocument(attachment.document, myID, attachment.title); - } else { - Zotero.Attachments.importFromURL(attachment.url, myID, - (attachment.mimeType ? attachment.mimeType : attachment.document.contentType), - (attachment.title ? attachment.title : attachment.document.title)); - } } else { + if(attachment.document + || (attachment.mimeType && attachment.mimeType == "text/html") + || Zotero.Prefs.get("downloadAssociatedFiles")) { + if(attachment.document) { + Zotero.Attachments.importFromDocument(attachment.document, myID, attachment.title); + } else { + Zotero.debug("GOT ATTACHMENT"); + Zotero.debug(attachment); + + var mimeType = null; + var title = null; + + if(attachment.mimeType) { + // first, try to extract mime type from mimeType attribute + mimeType = attachment.mimeType; + } else if(attachment.document && attachment.document.contentType) { + // if that fails, use document if possible + mimeType = attachment.document.contentType + } + + // same procedure for title as mime type + if(attachment.title) { + title = attachment.title; + } else if(attachment.document && attachment.document.title) { + title = attachment.document.title; + } + + Zotero.Attachments.importFromURL(attachment.url, myID, + mimeType, title); + } + } // links no longer exist, so just don't save them /*if(attachment.document) { attachmentID = Zotero.Attachments.linkFromURL(attachment.document.location.href, myID, @@ -1363,10 +1387,6 @@ Zotero.Translate.prototype._runHandler = function(type, argument) { * does the actual web translation */ Zotero.Translate.prototype._web = function() { - // TODO: Currently using automaticSnapshots for everything - //this._downloadAssociatedFiles = Zotero.Prefs.get("downloadAssociatedFiles"); - this._downloadAssociatedFiles = Zotero.Prefs.get("automaticSnapshots"); - try { this._sandbox.doWeb(this.document, this.location); } catch(e) { diff --git a/chrome/content/zotero/xpcom/utilities.js b/chrome/content/zotero/xpcom/utilities.js index 
c1a1c1e63..b2ed05dda 100644 --- a/chrome/content/zotero/xpcom/utilities.js +++ b/chrome/content/zotero/xpcom/utilities.js @@ -418,7 +418,7 @@ Zotero.Utilities.Ingester.HTTP.prototype.doGet = function(urls, processor, done) Zotero.Utilities.HTTP.doGet(url, function(xmlhttp) { try { if(processor) { - processor(xmlhttp.responseText, xmlhttp); + processor(xmlhttp.responseText, xmlhttp, url); } if(callAgain) { diff --git a/defaults/preferences/zotero.js b/defaults/preferences/zotero.js index b803b8187..ee9bd491e 100644 --- a/defaults/preferences/zotero.js +++ b/defaults/preferences/zotero.js @@ -10,6 +10,6 @@ pref("extensions.zotero.openURL.resolver","http://athene.gmu.edu:8888/lfp/LinkFi pref("extensions.zotero.openURL.version","0.1"); pref("extensions.zotero.parseEndNoteMIMETypes",true); pref("extensions.zotero.automaticSnapshots",true); -//pref("extensions.zotero.downloadAssociatedFiles",false); +pref("extensions.zotero.downloadAssociatedFiles",false); pref("extensions.zotero.reportTranslationFailure",true); pref("extensions.zotero.lastCreatorFieldMode",0); diff --git a/scrapers.sql b/scrapers.sql index a2a3687ee..4af6d2047 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,4 +1,4 @@ --- 119 +-- 120 -- ***** BEGIN LICENSE BLOCK ***** -- @@ -22,7 +22,7 @@ -- Set the following timestamp to the most recent scraper update date -REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-12-11 15:57:00')); +REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-12-11 18:37:00')); REPLACE INTO translators VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '1.0.0b3.r1', '', '2006-12-11 11:24:00', 1, 100, 4, 'Amazon', 'Sean Takats', '^http://(?:www\.)amazon', 'function detectWeb(doc, url) { @@ -540,7 +540,7 @@ REPLACE INTO translators VALUES ('88915634-1af6-c134-0171-56fd198235ed', '1.0.0b Zotero.wait(); }'); -REPLACE INTO translators VALUES ('d921155f-0186-1684-615c-ca57682ced9b', '1.0.0b3.r1', '', '2006-11-20 23:10:00', 1, 100, 4, 'JSTOR', 'Simon Kornblith', 
'^http://www\.jstor\.org/(?:view|browse|search/)', +REPLACE INTO translators VALUES ('d921155f-0186-1684-615c-ca57682ced9b', '1.0.0b3.r1', '', '2006-12-11 17:48:00', 1, 100, 4, 'JSTOR', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse|search/)', 'function detectWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { @@ -564,8 +564,7 @@ REPLACE INTO translators VALUES ('d921155f-0186-1684-615c-ca57682ced9b', '1.0.0b var m = viewRe.exec(viewURL); if(m) { return {url:m[1]+"cgi-bin/jstor/printpage"+m[2]+".pdf?dowhat=Acrobat", - mimeType:"application/pdf", title:"JSTOR Full Text PDF", - downloadable:true}; + mimeType:"application/pdf", title:"JSTOR Full Text PDF"}; } else { return false; } @@ -573,8 +572,13 @@ REPLACE INTO translators VALUES ('d921155f-0186-1684-615c-ca57682ced9b', '1.0.0b function itemComplete(newItem, url) { if(newItem.url) { - newItem.attachments.push({url:newItem.url, mimeType:"text/html", - title:"JSTOR Web-Readable Version"}); + if(useSnapshot) { + newItem.attachments.push({document:useSnapshot, + title:"JSTOR Snapshot"}); + } else { + newItem.attachments.push({url:newItem.url, mimeType:"text/html", + title:"JSTOR Snapshot"}); + } } else { if(newItem.ISSN) { newItem.url = "http://www.jstor.org/browse/"+newItem.ISSN; @@ -586,6 +590,8 @@ function itemComplete(newItem, url) { newItem.complete(); } +var useSnapshot = false; + function doWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? 
function(prefix) { @@ -654,6 +660,8 @@ function doWeb(doc, url) { } else { throw("Could not find citation save links"); } + + useSnapshot = doc; } Zotero.Utilities.HTTP.doGet(''http://www.jstor.org/browse?citationAction=removeAll&confirmRemAll=on&viewCitations=1'', function() { // clear marked @@ -730,7 +738,7 @@ function doWeb(doc, url) { Zotero.wait(); }'); -REPLACE INTO translators VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '1.0.0b2.r2', '', '2006-10-23 00:23:00', 1, 100, 4, 'History Cooperative', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.s?html$|cgi-bin/search.cgi)', +REPLACE INTO translators VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '1.0.0b2.r2', '', '2006-12-11 18:01:00', 1, 100, 4, 'History Cooperative', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.s?html$|cgi-bin/search.cgi)', 'function detectWeb(doc, url) { if(doc.title == "History Cooperative: Search Results") { return "multiple"; @@ -776,8 +784,7 @@ function scrape(doc) { newItem.date = month.getAttribute("content")+" "+year.getAttribute("content"); } - newItem.attachments.push({document:doc, title:"History Cooperative Full Text", - downloadable:true}); + newItem.attachments.push({document:doc, title:"History Cooperative Snapshot"}); newItem.complete(); } @@ -1298,7 +1305,7 @@ function doWeb(doc, url){ }'); -REPLACE INTO translators VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '1.0.0b3.r1', '', '2006-12-11 11:27:00', 1, 100, 4, 'ProQuest', 'Simon Kornblith', '^http://[^/]+/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)', +REPLACE INTO translators VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '1.0.0b3.r1', '', '2006-12-11 18:02:00', 1, 100, 4, 'ProQuest', 'Simon Kornblith', '^http://[^/]+/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)', 'function detectWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace 
? function(prefix) { @@ -1440,10 +1447,10 @@ REPLACE INTO translators VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '1.0.0b // figure out what we can attach var attachArray = { - ''//td[@class="textSmall"]//img[@alt="Full Text - PDF"]'':"ProQuest Full Text (PDF)", - ''//td[@class="textSmall"]//img[@alt="Text+Graphics"]'':"ProQuest Full Text (HTML with Graphics)", - ''//td[@class="textSmall"]//img[@alt="Full Text"]'':"ProQuest Full Text (HTML)", - ''//td[@class="textSmall"]//img[@alt="Abstract"]'':"ProQuest Abstract" + ''//td[@class="textSmall"]//img[@alt="Full Text - PDF"]'':"ProQuest Full Text PDF", + ''//td[@class="textSmall"]//img[@alt="Text+Graphics"]'':"ProQuest Snapshot (HTML with Graphics)", + ''//td[@class="textSmall"]//img[@alt="Full Text"]'':"ProQuest Snapshot (HTML)", + ''//td[@class="textSmall"]//img[@alt="Abstract"]'':"ProQuest Snapshot (Abstract)" } for(var xpath in attachArray) { var item = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); @@ -1452,13 +1459,21 @@ REPLACE INTO translators VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '1.0.0b if(item.parentNode.tagName.toLowerCase() == "a") { // item is not this page - newItem.attachments.push({url:item.parentNode.href, - title:title, mimeType:(title == "ProQuest Full Text (PDF)" ? 
"application/pdf" : "text/html"), - downloadable:true}); + if(title == "ProQuest Full Text PDF") { + // PDF gets different mime type and downloadability + newItem.attachments.push({url:item.parentNode.href, + title:title, mimeType:"application/pdf"}); + } else { + newItem.attachments.push({url:item.parentNode.href, + title:title, mimeType:"text/html"}); + } } else { // item is this page - newItem.attachments.push({document:doc, title:title, downloadable:true}); + newItem.attachments.push({document:doc, title:title}); } + + // only snapshot one of the possible types; the PDF is a download rather than a snapshot, so keep looking after it + if(title != "ProQuest Full Text PDF") break; } } @@ -1524,7 +1539,7 @@ function doWeb(doc, url) { } }'); -REPLACE INTO translators VALUES ('6773a9af-5375-3224-d148-d32793884dec', '1.0.0b3.r1', '', '2006-10-02 17:00:00', 1, 100, 4, 'InfoTrac College Edition', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/', +REPLACE INTO translators VALUES ('6773a9af-5375-3224-d148-d32793884dec', '1.0.0b3.r1', '', '2006-12-11 18:04:00', 1, 100, 4, 'InfoTrac College Edition', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/', 'function detectWeb(doc, url) { if(doc.title.substring(0, 8) == "Article ") { return "magazineArticle"; @@ -1607,11 +1622,10 @@ REPLACE INTO translators VALUES ('6773a9af-5375-3224-d148-d32793884dec', '1.0.0b } if(doc) { - newItem.attachments.push({document:doc, title:"InfoTrac Full Text", - downloadable:true}); + newItem.attachments.push({document:doc, title:"InfoTrac Snapshot"}); } else { - newItem.attachments.push({url:url, title:"InfoTrac Full Text", - mimeType:"text/html", downloadable:true}); + newItem.attachments.push({url:url, title:"InfoTrac Snapshot", + mimeType:"text/html"}); } newItem.complete(); @@ -1667,7 +1681,7 @@ function doWeb(doc, url) { } }'); -REPLACE INTO translators VALUES ('63c25c45-6257-4985-9169-35b785a2995e', '1.0.0b2.r2', '', '2006-10-02 17:00:00', 1, 100, 4, 'InfoTrac OneFile', 'Simon Kornblith', 
'^https?://[^/]+/itx/(?:[a-z]+Search|retrieve|paginate|tab)\.do', +REPLACE INTO translators VALUES ('63c25c45-6257-4985-9169-35b785a2995e', '1.0.0b2.r2', '', '2006-12-11 18:04:00', 1, 100, 4, 'InfoTrac OneFile', 'Simon Kornblith', '^https?://[^/]+/itx/(?:[a-z]+Search|retrieve|paginate|tab)\.do', 'function detectWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { @@ -1797,8 +1811,7 @@ REPLACE INTO translators VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '1.0.0b }', 'function scrape(doc) { var newItem = new Zotero.Item(); - newItem.attachments.push({document:doc, title:"LexisNexis Full Text", - downloadable:true}); + newItem.attachments.push({document:doc, title:"LexisNexis Snapshot"}); var citationDataDiv; var divs = doc.getElementsByTagName("div"); @@ -2436,7 +2449,7 @@ REPLACE INTO translators VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '1.0.0b Zotero.wait(); }'); -REPLACE INTO translators VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '1.0.0b3.r1', '', '2006-10-02 17:00:00', 1, 100, 4, 'Project MUSE', 'Simon Kornblith', '^http://muse\.jhu\.edu/(?:journals/[^/]+/[^/]+/[^/]+\.html|search/pia.cgi)', +REPLACE INTO translators VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '1.0.0b3.r1', '', '2006-12-11 18:09:00', 1, 100, 4, 'Project MUSE', 'Simon Kornblith', '^http://muse\.jhu\.edu/(?:journals/[^/]+/[^/]+/[^/]+\.html|search/pia.cgi)', 'function detectWeb(doc, url) { var searchRe = new RegExp("^http://[^/]+/search/pia\.cgi"); if(searchRe.test(url)) { @@ -2476,14 +2489,12 @@ REPLACE INTO translators VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '1.0.0b for(var i=0; i