From f1cc809f76f412eeb162132d26ce44be070724f8 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Mon, 26 Jun 2006 20:44:45 +0000 Subject: [PATCH] Add a generic scraper that will scrape any website, although it may not always find very much information. It looks at META tags, both Dublin Core and otherwise. When tags are ready, we can pull out META keywords. --- scrapers.sql | 49 +++++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/scrapers.sql b/scrapers.sql index 60feb8f64..1c6e0c8f5 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1,7 +1,7 @@ --- 26 +-- 27 -- Set the following timestamp to the most recent scraper update date -REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-26 16:01:00')); +REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-26 16:41:00')); REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-26 16:01:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', 'if(doc.title.indexOf("search") >= 0) { @@ -794,7 +794,6 @@ if(newUri) { var m = urlRe.exec(urls[0]); var clearUrl = m[0]+"?clear_saves=1"; var postUrl = m[0]; - var exportUrl = m[1]+"++export/1,-1,-1,B/export"; var actionUrl = m[2]+m[3]; var postString = ""; @@ -2325,40 +2324,54 @@ utilities.HTTPUtilities.doGet(newUri, null, function(text) { wait();'); -REPLACE INTO "scrapers" VALUES('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-26 16:01:00', 'Scraper for Dublin Core expressed as HTML META elements', 'Simon Kornblith', +REPLACE INTO "scrapers" VALUES('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-26 16:41:00', 'Generic Scraper', 'Simon Kornblith', '', 'return "website";', -'var metaTags = doc.getElementsByTagName("meta"); - -if(metaTags) { - for(var i=0; i