From c27ff75e80dae96c88779e9490eab798989e471a Mon Sep 17 00:00:00 2001
From: Dan Stillman <dstillman@zotero.org>
Date: Thu, 26 May 2016 00:31:34 -0400
Subject: [PATCH] Use Reader Mode for HTML snapshots if possible

WebPageDump hasn't been updated in years, it can result in
multi-megabyte snapshots on pages with lots of JS and ads, and there are
often rendering problems viewing snapshots of JS-heavy pages. ScrapBook
X, a fork of ScrapBook (which WBP was based on), is actively maintained
and works much better, but like ScrapBook it's not modular and it can
produce massive snapshots (e.g., 12MB for a single page with lots of
ads).

Rather than produce ugly, broken-looking pages, we can run eligible
pages through Firefox's Reader Mode (a modified Readability under the
hood) and save those. While this won't always be perfect, most of the
time it will save what people actually care about - the text of the page
-- and it avoids filling up people's storage directories and storage
accounts with junk. We should probably reduce the number of translators
that save snapshots in general, but this lessens their impact.

The current implementation will need to be updated for Standalone,
either by including the Reader Mode files in Standalone or switching to
other JS libraries. This strips the standard controls before saving, but
it might be nice to provide some format options when viewing (or just
run the page through Reader Mode again). We can also customize the
default styling.
---
 chrome/content/zotero/xpcom/attachments.js    | 67 ++++++++++++++++++-
 .../xpcom/translation/translate_item.js       | 27 ++++++--
 test/tests/server_connectorTest.js            | 64 +++++++++++++++++-
 3 files changed, 150 insertions(+), 8 deletions(-)

diff --git a/chrome/content/zotero/xpcom/attachments.js b/chrome/content/zotero/xpcom/attachments.js
index 2683403cd..a5c2783ee 100644
--- a/chrome/content/zotero/xpcom/attachments.js
+++ b/chrome/content/zotero/xpcom/attachments.js
@@ -586,9 +586,70 @@ Zotero.Attachments = new function(){
 			}
 		}
 		
+		var browser;
+		
 		if (contentType === 'text/html' || contentType === 'application/xhtml+xml') {
+			// If page looks like an article, run it through Reader Mode
+			try {
+				Components.utils.import("resource://gre/modules/ReaderMode.jsm")
+				if (ReaderMode.isProbablyReaderable(document)) {
+					Zotero.debug("Document is probably readerable");
+					browser = Zotero.Browser.createHiddenBrowser();
+					let loadDeferred = Zotero.Promise.defer();
+					browser.addEventListener('DOMContentLoaded', () => loadDeferred.resolve());
+					// Use Firefox's Reader Mode template
+					browser.loadURI("chrome://global/content/reader/aboutReader.html");
+					yield loadDeferred.promise;
+					let doc = browser.contentDocument;
+					
+					let article = yield ReaderMode.parseDocument(document);
+					
+					let scriptElement = doc.getElementsByTagName('script')[0];
+					let toolbarElement = doc.getElementById('reader-toolbar');
+					let headerElement = doc.getElementById('reader-header');
+					let domainElement = doc.getElementById('reader-domain');
+					let titleElement = doc.getElementById('reader-title');
+					let bylineElement = doc.getElementById('reader-credits');
+					let contentElement = doc.getElementById('moz-reader-content');
+					let footerElement = doc.getElementById('reader-footer');
+					
+					// Remove Reader Mode controls
+					[scriptElement, toolbarElement, domainElement, footerElement].forEach(elem => {
+						elem.parentNode.removeChild(elem);
+					});
+					
+					// The following logic more or less follows AboutReader.jsm::_showContent()
+					doc.title = article.title;
+					titleElement.textContent = article.title;
+					
+					if (article.byline) {
+						bylineElement.textContent = article.byline;
+					}
+					
+					headerElement.style.display = 'block';
+					
+					let parserUtils = Cc["@mozilla.org/parserutils;1"].getService(Ci.nsIParserUtils);
+					let contentFragment = parserUtils.parseFragment(
+						article.content,
+						Ci.nsIParserUtils.SanitizerDropForms | Ci.nsIParserUtils.SanitizerAllowStyle,
+						false,
+						Services.io.newURI(url, null, null),
+						contentElement
+					);
+					contentElement.innerHTML = "";
+					contentElement.appendChild(contentFragment);
+					
+					contentElement.style.display = 'block';
+					
+					document = doc;
+				}
+			}
+			catch (e) {
+				Zotero.logError(e);
+			}
+			
 			// Load WebPageDump code
-			var wpd = {"Zotero":Zotero};
+			var wpd = { Zotero };
 			Components.classes["@mozilla.org/moz/jssubscript-loader;1"]
 				.getService(Components.interfaces.mozIJSSubScriptLoader)
 				.loadSubScript("chrome://zotero/content/webpagedump/common.js", wpd);
@@ -688,6 +749,10 @@ Zotero.Attachments = new function(){
 			yield Zotero.Fulltext.indexDocument(document, attachmentItem.id);
 		}
 		
+		if (browser) {
+			Zotero.Browser.deleteHiddenBrowser(browser);
+		}
+		
 		return attachmentItem;
 	});
 
diff --git a/chrome/content/zotero/xpcom/translation/translate_item.js b/chrome/content/zotero/xpcom/translation/translate_item.js
index 0c3084d4e..f959bdab6 100644
--- a/chrome/content/zotero/xpcom/translation/translate_item.js
+++ b/chrome/content/zotero/xpcom/translation/translate_item.js
@@ -554,12 +554,27 @@ Zotero.Translate.ItemSaver.prototype = {
 			Zotero.debug('Importing attachment from document');
 			attachment.linkMode = "imported_url";
 			
-			return Zotero.Attachments.importFromDocument({
-				libraryID: this._libraryID,
-				document: attachment.document,
-				parentItemID: parentID,
-				title: title
-			});
+			let url = attachment.document.location.href;
+			let html = attachment.document.documentElement.innerHTML;
+			
+			// Save via connector server to address bug saving document directly
+			Zotero.Server.Connector.Data[url] = "<html>" + html + "</html>";
+			let deferred = Zotero.Promise.defer();
+			Zotero.HTTP.processDocuments(
+				["zotero://connector/" + encodeURIComponent(url)],
+				function (doc) {
+					delete Zotero.Server.Connector.Data[url];
+					
+					return Zotero.Attachments.importFromDocument({
+						document: doc,
+						parentItemID: parentID,
+						title: doc.title
+					})
+					.then(attachment => deferred.resolve(attachment))
+					.catch(e => deferred.reject(e));
+				}
+			);
+			return deferred.promise;
 		}
 		
 		// Import from URL
diff --git a/test/tests/server_connectorTest.js b/test/tests/server_connectorTest.js
index 2591ccc3c..16331b956 100644
--- a/test/tests/server_connectorTest.js
+++ b/test/tests/server_connectorTest.js
@@ -104,7 +104,7 @@ describe("Connector Server", function () {
 	});
 	
 	describe("/connector/saveSnapshot", function () {
-		it("should save a webpage item and snapshot to the current selected collection", function* () {
+		it("should save a webpage item and non-readerable snapshot to the current selected collection", function* () {
 			var collection = yield createDataObject('collection');
 			yield waitForItemsLoad(win);
 			
@@ -144,6 +144,68 @@ describe("Connector Server", function () {
 			item = Zotero.Items.get(ids2[0]);
 			assert.isTrue(item.isImportedAttachment());
 			assert.equal(item.getField('title'), 'Title');
+			var path = yield item.getFilePathAsync();
+			var fileContents = yield Zotero.File.getContentsAsync(path);
+			assert.notInclude(fileContents, 'moz-reader-content');
+		});
+		
+		it("should save a webpage item and readerable snapshot to the current selected collection", function* () {
+			var collection = yield createDataObject('collection');
+			yield waitForItemsLoad(win);
+			
+			var pageTitle = "Page Title";
+			var articleTitle = "Article Title";
+			var byline = "First Last";
+			var content = "<p>" + new Array(50).fill("").map(x => Zotero.Utilities.randomString()).join(" ") + "</p>"
+				+ "<p>" + new Array(50).fill("").map(x => Zotero.Utilities.randomString()).join(" ") + "</p>"
+				+ "<p>" + new Array(50).fill("").map(x => Zotero.Utilities.randomString()).join(" ") + "</p>"
+			
+			// saveSnapshot saves parent and child before returning
+			var ids1, ids2;
+			var promise = waitForItemEvent('add').then(function (ids) {
+				ids1 = ids;
+				return waitForItemEvent('add').then(function (ids) {
+					ids2 = ids;
+				});
+			});
+			yield Zotero.HTTP.request(
+				'POST',
+				connectorServerPath + "/connector/saveSnapshot",
+				{
+					headers: {
+						"Content-Type": "application/json"
+					},
+					body: JSON.stringify({
+						url: "http://example.com/articles/1",
+						html: `<html><head><title>${pageTitle}</title><body>`
+							+ "<article>"
+							+ `<h1>${articleTitle}</h1>`
+							+ `<p class="author">${byline}</p>`
+							+ content
+							+ "</article>"
+							+ "</body></html>"
+					})
+				}
+			);
+			
+			assert.isTrue(promise.isFulfilled());
+			
+			// Check parent item
+			assert.lengthOf(ids1, 1);
+			var item = Zotero.Items.get(ids1[0]);
+			assert.equal(Zotero.ItemTypes.getName(item.itemTypeID), 'webpage');
+			assert.isTrue(collection.hasItem(item.id));
+			assert.equal(item.getField('title'), pageTitle);
+			
+			// Check attachment
+			assert.lengthOf(ids2, 1);
+			item = Zotero.Items.get(ids2[0]);
+			assert.isTrue(item.isImportedAttachment());
+			assert.equal(item.getField('title'), pageTitle);
+			var path = yield item.getFilePathAsync();
+			var fileContents = yield Zotero.File.getContentsAsync(path);
+			assert.include(fileContents, 'moz-reader-content');
+			assert.include(fileContents, content.match(/[a-z]+/i)[0]);
 		});
 		
 		it("should save a PDF to the current selected collection", function* () {