-- 31 -- Set the following timestamp to the most recent scraper update date REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-28 23:08:00')); REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-28 23:08:00', 3, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', 'if(doc.location.href.indexOf("search") >= 0) { return "multiple"; } else { return "book"; } ', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; function scrape(doc) { uri = doc.location.href; // Retrieve authors var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a''; var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); for (var i = 0; i < elmts.length; i++) { var elmt = elmts[i]; model.addStatement(uri, prefixDC + ''creator'', utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here } // Retrieve data from "Product Details" box var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li''; var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); for (var i = 0; i < elmts.length; i++) { var elmt = elmts[i]; var attribute = utilities.cleanString(utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue); if(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) { var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue); if(attribute == "Publisher:") { if(value.lastIndexOf("(") != -1) { var date = value.substring(value.lastIndexOf("(")+1, value.length-1); jsDate = new Date(date); if(!isNaN(jsDate.valueOf())) { date = utilities.dateToISO(jsDate); } value = value.substring(0, value.lastIndexOf("(")-1); } if(value.lastIndexOf(";") != -1) { var edition = value.substring(value.lastIndexOf(";")+2, value.length); value = value.substring(0, value.lastIndexOf(";")); } model.addStatement(uri, prefixDC + ''publisher'', value); model.addStatement(uri, prefixDC + ''date'', date); model.addStatement(uri, prefixDC + ''hasVersion'', edition); } else if(attribute == "Language:") { model.addStatement(uri, prefixDC + ''language'', value); } else if(attribute == "ISBN:") { model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value); } else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") { model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" "))); model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":"))); } } } var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]''; var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); var title = utilities.cleanString(utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue); if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) { title = title.substring(0, title.lastIndexOf("(")-1); } model.addStatement(uri, prefixDC + ''title'', title); model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); } var searchRe = new RegExp(''^http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/)''); var m = searchRe.exec(doc.location.href) if(m) { // Why can''t amazon use the same stylesheets var xpath; if(m == "gp/search/") { xpath = ''//table[@class="searchresults"]''; } else { xpath = ''//table[@cellpadding="3"]''; } var searchresults = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); var items = utilities.getItemArray(doc, searchresults, ''^http://www\.amazon\.com/(gp/product/|exec/obidos/tg/detail/)'', ''^(Buy new|Hardcover|Paperback|Digital)$''); items = utilities.selectItems(items); if(!items) { return true; } var uris = new Array(); for(i in items) { uris.push(i); } utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) }, function() { done(); }, function() {}); wait(); } else { scrape(doc); }'); REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-26 16:01:00', 3, 'WorldCat Scraper', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/', 'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { return "book"; } else if(doc.title == ''FirstSearch: WorldCat List of Records'') { return "multiple"; } return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var sessionRegexp = /(?:\?|\:)sessionid=([^?:]+)(?:\?|\:|$)/; var numberRegexp = /(?:\?|\:)recno=([^?:]+)(?:\?|\:|$)/; var resultsetRegexp = /(?:\?|\:)resultset=([^?:]+)(?:\?|\:|$)/; var hostRegexp = new RegExp("http://([^/]+)/"); var uri = doc.location.href; var sMatch = sessionRegexp.exec(uri); var sessionid = sMatch[1]; var hMatch = hostRegexp.exec(uri); var host = hMatch[1]; var newUri, exportselect; if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { var publisherRegexp = /^(.*), (.*?),?$/; var nMatch = numberRegexp.exec(uri); if(nMatch) { var number = nMatch[1]; } else { number = 1; } var rMatch = resultsetRegexp.exec(uri); if(rMatch) { var resultset = rMatch[1]; } else { // It''s in an XPCNativeWrapper, so we have to do this black magic resultset = doc.forms.namedItem(''main'').elements.namedItem(''resultset'').value; } exportselect = ''record''; newUri = ''http://''+host+''/WebZ/DirectExport?numrecs=10:smartpage=directexport:entityexportnumrecs=10:entityexportresultset='' + resultset + '':entityexportrecno='' + number + '':sessionid='' + sessionid + '':entitypagenum=35:0''; var uris = new Array(newUri); } else { var items = utilities.getItemArray(doc, doc, ''/WebZ/FSFETCH\\?fetchtype=fullrecord'', ''^(See more details for locating this item|Detailed Record)$''); items = utilities.selectItems(items); if(!items) { return true; } // Set BookMark cookie for(i in items) { // Hack to get first item var myCookie = sessionid+":"; var rMatch = resultsetRegexp.exec(i); var resultset = rMatch[1]; break; } var uris = new Array(); for(i in items) { var nMatch = numberRegexp.exec(i); myCookie += resultset+"_"+nMatch[1]+","; uris.push(i); } myCookie = myCookie.substr(0, myCookie.length-1); doc.cookie = "BookMark="+myCookie; exportselect = ''marked''; newUri = ''http://''+host+''/WebZ/DirectExport?numrecs=10:smartpage=directexport:entityexportnumrecs=10:entityexportresultset='' + resultset + '':entityexportrecno=1:sessionid='' + sessionid + '':entitypagenum=29:0''; } utilities.HTTPUtilities.doPost(newUri, ''exportselect=''+exportselect+''&exporttype=plaintext'', null, function(text) { var lineRegexp = new RegExp(); lineRegexp.compile("^([\\w() ]+): *(.*)$"); var k = 0; var uri = uris[k]; model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); var lines = text.split(''\n''); for(var i=0;i/; var stableURL, ISSN; for(i in lines) { if(lines[i].substring(0,3) == "<1>") { haveStarted = true; } else if(newItemRe.test(lines[i])) { if(!stableURL) { if(ISSN) { stableURL = "http://www.jstor.org/browse/"+ISSN; } else { // Just make sure it''s unique stableURL = k; k++; } } model.addStatement(stableURL, prefixRDF + "type", prefixDummy + "journalArticle", false); for(i in data) { if(data[i].length) { for(j in data[i]) { model.addStatement(stableURL, i, data[i][j]); } } } var data = newDataObject(); delete ISSN; delete stableURL; } else if(lines[i].substring(2, 5) == " : " && haveStarted) { var fieldCode = lines[i].substring(0, 2); var fieldContent = utilities.cleanString(lines[i].substring(5)) if(fieldCode == "TI") { data[prefixDC + "title"].push(fieldContent); } else if(fieldCode == "AU") { var authors = fieldContent.split(";"); for(j in authors) { var author = authors[j]; if(author) { var splitNames = author.split('', ''); if(splitNames) { author = splitNames[1]+'' ''+splitNames[0]; } data[prefixDC + "creator"].push(author); } } } else if(fieldCode == "SO") { data[prefixDummy + "publication"].push(fieldContent); } else if(fieldCode == "VO") { data[prefixDummy + "volume"].push(fieldContent); } else if(fieldCode == "NO") { data[prefixDummy + "number"].push(fieldContent); } else if(fieldCode == "SE") { data[prefixDummy + "series"].push(fieldContent); } else if(fieldCode == "DA") { var date = new Date(fieldContent.replace(".", "")); if(isNaN(date.valueOf())) { data[prefixDC + "date"].push(fieldContent); } else { data[prefixDC + "date"].push(utilities.dateToISO(date)); } } else if(fieldCode == "PP") { data[prefixDummy + "pages"].push(fieldContent); } else if(fieldCode == "EI") { stableURL = fieldContent; } else if(fieldCode == "IN") { data[prefixDC + "identifier"].push("ISSN "+fieldContent); ISSN = fieldContent; } else if(fieldCode == "PB") { data[prefixDC + "publisher"].push(fieldContent); } } } // Loop through again so that we can add with the stableURL if(!stableURL) { if(ISSN) { stableURL = "http://www.jstor.org/browse/"+ISSN; } else { // Just make sure it''s unique stableURL = k; k++; } } model.addStatement(stableURL, prefixRDF + "type", prefixDummy + "journalArticle", false); for(i in data) { if(data[i].length) { for(j in data[i]) { model.addStatement(stableURL, i, data[i][j]); } } } done(); }); }, function() {}); }); wait();'); REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 3, 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)', 'if(doc.title == "History Cooperative: Search Results") { return "multiple"; } else { return "journalArticle"; }', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; function associateMeta(uri, metaTags, field, rdfUri) { var field = metaTags.namedItem(field); if(field) { model.addStatement(uri, rdfUri, field.getAttribute("content"), false); } } function scrape(doc) { var uri = doc.location.href; var month, year; var metaTags = doc.getElementsByTagName("meta"); associateMeta(uri, metaTags, "Title", prefixDC + "title"); associateMeta(uri, metaTags, "Journal", prefixDummy + "publication"); associateMeta(uri, metaTags, "Volume", prefixDummy + "volume"); associateMeta(uri, metaTags, "Issue", prefixDummy + "number"); var author = metaTags.namedItem("Author"); if(author) { var authors = author.getAttribute("content").split(" and "); for(j in authors) { model.addStatement(uri, prefixDC + "creator", authors[j], false); } } var month = metaTags.namedItem("PublicationMonth"); var year = metaTags.namedItem("PublicationYear"); if(month && year) { model.addStatement(uri, prefixDC + "date", month.getAttribute("content")+" "+year.getAttribute("content"), false); } model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false); } if(doc.title == "History Cooperative: Search Results") { var items = utilities.getItemArray(doc, doc, ''^http://[^/]+/journals/.+/.+/.+\.html$''); items = utilities.selectItems(items); if(!items) { return true; } var uris = new Array(); for(i in items) { uris.push(i); } utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) }, function() { done(); }, function() {}); wait(); } else { scrape(doc); }'); REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-06-28 22:52:00', 3, 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)', '// First, check to see if the URL alone reveals InnoPAC, since some sites don''t reveal the MARC button var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$''); if(matchRegexp.test(doc.location.href)) { return "book"; } // Next, look for the MARC button var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; var xpath = ''//a[img[@alt="MARC Display"]]''; var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); if(elmts.length) { return "book"; } // Also, check for links to an item display page var tags = doc.getElementsByTagName("a"); for(i=0; i= 0) { model.addStatement(uri, prefixRDF + "type", prefixDummy + "magazineArticle", false); } else if(value.indexOf("newspaper") >= 0) { model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaperArticle", false); } else { model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); } } } else if(field == "isbn" || field == "issn" || field == "issn/isbn") { var value = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); if(value) { var type; value = utilities.superCleanString(value.nodeValue); if(value.length == 10 || value.length == 13) { type = "ISBN"; } else if(value.length == 8) { type = "ISSN"; } if(type) { model.addStatement(uri, prefixDC + "identifier", type+" "+value, false); } } } } } if(doc.title == "Results") { var items = new Object(); // Require link to match this var tagRegexp = new RegExp(); tagRegexp.compile(''^http://[^/]+/pqdweb\\?((?:.*&)?did=.*&Fmt=[12]|(?:.*&)Fmt=[12].*&did=)''); var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[@class="rowUnMarked"]/td[3][@class="textMedium"]'', nsResolver); // Go through table rows for(var i=0; i]*>/gi); model.addStatement(uri, prefixDummy + "publication", elementParts[elementParts.length-1], true); var dateRegexp = /]*>(?:)?([A-Z][a-z]+)(?:<\/b>)? ([0-9]+, [0-9]{4})/; var m = dateRegexp.exec(centerElements[centerElements.length-1].innerHTML); if(m) { var jsDate = new Date(m[1]+" "+m[2]); model.addStatement(uri, prefixDC + "date", utilities.dateToISO(jsDate), true); } else { var elementParts = centerElements[centerElements.length-1].innerHTML.split(/]*>/gi); model.addStatement(uri, prefixDC + "date", elementParts[1], true); } var cutIndex = citationDataDiv.innerHTML.indexOf("BODY:"); if(cutIndex < 0) { cutIndex = citationDataDiv.innerHTML.indexOf("TEXT:"); } if(cutIndex > 0) { citationData = citationDataDiv.innerHTML.substring(0, cutIndex); } else { citationData = citationDataDiv.innerHTML; } citationData = utilities.cleanTags(citationData); var headlineRegexp = /\n(?:HEADLINE|TITLE|ARTICLE): ([^\n]+)\n/; var m = headlineRegexp.exec(citationData); if(m) { model.addStatement(uri, prefixDC + "title", utilities.cleanTags(m[1]), true); } var bylineRegexp = /\nBYLINE: *(\w[\w\- ]+)/; var m = bylineRegexp.exec(citationData); if(m) { if(m[1].substring(0, 3).toLowerCase() == "by ") { m[1] = m[1].substring(3); } model.addStatement(uri, prefixDC + "creator", m[1], true); model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaperArticle", false); } else { model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false); } var authorRegexp = /\n(?:AUTHOR|NAME): ([^\n]+)\n/; var m = authorRegexp.exec(citationData); if(m) { var authors = m[1].split(/, (?:and )?/); for(i in authors) { model.addStatement(uri, prefixDC + "creator", authors[i].replace(" *", ""), true); } } } var detailRe = new RegExp("^http://[^/]+/universe/document"); if(detailRe.test(doc.location.href)) { scrape(doc); } else { var items = utilities.getItemArray(doc, doc, "^http://[^/]+/universe/document"); items = utilities.selectItems(items); if(!items) { return true; } var uris = new Array(); for(i in items) { uris.push(i); } utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) }, function() { done(); }, function() {}); wait(); }'); REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-26 16:01:00', 3, 'Aleph Scraper', 'Simon Kornblith', '^http://[^/]+/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find)', 'var singleRe = new RegExp("^http://[^/]+/F/[A-Z0-9\-]+\?.*func=full-set-set.*\&format=[0-9]{3}"); if(singleRe.test(doc.location.href)) { return "book"; } else { var tags = doc.getElementsByTagName("a"); for(var i=0; i 3) { var ind1 = field.charAt(3); if(field.length > 4) { var ind2 = field.charAt(4); } } record.add_field(code, ind1, ind2, value); } } utilities.importMARCRecord(record, uri, model); }, function() { done(); }, function() {}); wait();'); REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006-06-26 16:01:00', 3, 'Dynix Scraper', 'Simon Kornblith', 'ipac\.jsp\?.*(?:uri=full=[0-9]|menu=search)', 'var detailsRe = new RegExp(''ipac\.jsp\?.*uri=full=[0-9]''); if(detailsRe.test(doc.location.href)) { return "book"; } else { return "multiple"; }', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var uri = doc.location.href; var detailsRe = new RegExp(''ipac\.jsp\?.*uri=full=[0-9]''); var uris = new Array(); if(detailsRe.test(uri)) { uris.push(uri+''&fullmarc=true''); } else { var items = utilities.getItemArray(doc, doc, "ipac\.jsp\?.*uri=full=[0-9]|^javascript:buildNewList\\(''.*uri%3Dfull%3D[0-9]"); items = utilities.selectItems(items); if(!items) { return true; } var buildNewList = new RegExp("^javascript:buildNewList\\(''([^'']+)"); var uris = new Array(); for(i in items) { var m = buildNewList.exec(i); if(m) { uris.push(unescape(m[1]+''&fullmarc=true'')); } else { uris.push(i+''&fullmarc=true''); } } } utilities.processDocuments(browser, null, uris, function(newBrowser) { var newDoc = newBrowser.contentDocument; var uri = newDoc.location.href; var namespace = newDoc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; var xpath = ''//form/table[@class="tableBackground"]/tbody/tr/td/table[@class="tableBackground"]/tbody/tr[td[1]/a[@class="normalBlackFont1"]]''; var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); var record = new MARC_Record(); for(var i=0; i 0) { return "multiple"; } else { return "book"; }', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var checkItems = false; if(doc.location.href.indexOf("/authority_hits") > 0) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; checkItems = utilities.gatherElementsOnXPath(doc, doc, "/html/body//ol/li", nsResolver); } if(checkItems && checkItems.length) { var items = utilities.getItemArray(doc, checkItems, ''https?://.*/web2/tramp2\.exe/see_record''); items = utilities.selectItems(items); if(!items) { return true; } var uris = new Array(); for(i in items) { uris.push(i); } } else { var uris = new Array(doc.location.href); } for(i in uris) { var uri = uris[i]; var uriRegexp = /^(https?:\/\/.*\/web2\/tramp2\.exe\/)(?:goto|see\_record|authority\_hits)(\/.*)\?(?:screen=Record\.html\&)?(.*)$/i; var m = uriRegexp.exec(uri); if(uri.indexOf("/authority_hits") < 0) { var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc&"+m[3]; } else { var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc"; } // Keep track of how many requests have been completed var j = 0; utilities.HTTPUtilities.doGet(newUri, null, function(text) { var record = new MARC_Record(); record.load(text, "binary"); utilities.importMARCRecord(record, uris[j], model); j++; if(j == uris.length) { done(); } }); } wait();'); REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 3, 'GEAC Scraper', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))', 'if(doc.location.href.indexOf("/GeacQUERY") > 0) { return "multiple"; } else { return "book"; }', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var uri = doc.location.href; var uris = new Array(); if(uri.indexOf("/GeacQUERY") > 0) { var items = utilities.getItemArray(doc, doc, "(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html)"); items = utilities.selectItems(items); if(!items) { return true; } var uris = new Array(); for(i in items) { var newUri = i.replace(/([:&])next=html\/geacnffull.html/, "$1next=html/marc.html"); newUri = newUri.replace(/([:&])next=html\/record.html/, "$1next=html/marc.html"); uris.push(newUri); } } else { var newUri = uri.replace(/([:&])next=html\/geacnffull.html/, "$1next=html/marc.html"); newUri = newUri.replace(/([:&])next=html\/record.html/, "$1next=html/marc.html"); uris.push(newUri); } utilities.processDocuments(browser, null, uris, function(newBrowser) { var newDoc = newBrowser.contentDocument; var uri = newDoc.location.href; var namespace = newDoc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; var record = new MARC_Record(); var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, ''//pre/text()'', nsResolver); var tag, ind1, ind2, content; for(var i=0; i 10) { ind1 = line.substring(4, 5); ind2 = line.substring(5, 6); content = line.substring(7); content = content.replace(/\$([a-z])(?: |$)/g, record.subfield_delimiter+"$1"); } else { ind1 = ""; ind2 = ""; content = line.substring(4); } } utilities.importMARCRecord(record, uri, model); }, function() { done(); }, function() {}); wait();'); REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-26 16:01:00', 3, 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', 'var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p/text()[1]'', nsResolver); for(i in elmts) { if(utilities.superCleanString(elmts[i].nodeValue) == "Viewing record") { return "book"; } } var xpath = ''//form[@name="hitlist"]/table/tbody/tr''; var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); if(elmts.length) { return "multiple"; } return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; // Cheap hack to convert HTML entities function unescapeHTML(text) { var div = doc.createElement("div"); div.innerHTML = utilities.cleanTags(text); var text = div.childNodes[0] ? div.childNodes[0].nodeValue : null; delete div; return text; } var uri = doc.location.href; var recNumbers = new Array(); var xpath = ''//form[@name="hitlist"]/table/tbody/tr''; var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); if(elmts.length) { // Search results page var uriRegexp = /^http:\/\/[^\/]+/; var m = uriRegexp.exec(uri); var postAction = doc.forms.namedItem("hitlist").getAttribute("action"); var newUri = m[0]+postAction.substr(0, postAction.length-1)+"40" var titleRe = /
\s*(.*[^\s])\s*
/i; var items = new Array(); for(var i=0; i"); texts = texts[1].split(""); text = unescapeHTML(texts[0]); var documents = text.split("*** DOCUMENT BOUNDARY ***"); for(var j=1; j 10) { ind1 = line.substr(6, 1); ind2 = line.substr(7, 1); content = line.substr(8); } else { ind1 = ""; ind2 = ""; content = line.substring(6); } } utilities.importMARCRecord(record, uri, model); } done(); }); wait();'); REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-26 16:01:00', 3, 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?(?:.*LabelDisplay.*RecordNumber=[0-9]|Search|ItemTitles)', 'var detailRe = new RegExp("TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]"); if(detailRe.test(doc.location.href)) { return "book"; } else { return "multiple"; }', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; var detailRe = new RegExp("TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]"); var uri = doc.location.href; var newUris = new Array(); if(detailRe.test(uri)) { newUris.push(uri.replace("LabelDisplay", "MARCDisplay")); } else { var items = utilities.getItemArray(doc, doc, ''TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]''); items = utilities.selectItems(items); if(!items) { return true; } for(i in items) { newUris.push(i.replace("LabelDisplay", "MARCDisplay")); } } utilities.processDocuments(browser, null, newUris, function(newBrowser) { var newDoc = newBrowser.contentDocument; var uri = newDoc.location.href; var namespace = newDoc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; var record = new MARC_Record(); var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, ''/html/body/table/tbody/tr[td[4]]'', nsResolver); var tag, ind1, ind2, content; for(var i=0; i 1) { var data = newDataObject(); for(i in lines) { var fieldCode = lines[i].substring(0, 2); var fieldContent = utilities.cleanString(lines[i].substring(6)) if(fieldCode == "T1") { data[prefixDC + "title"].push(fieldContent); } else if(fieldCode == "A1") { var authors = fieldContent.split(";"); for(j in authors) { var author = authors[j]; if(author) { var splitNames = author.split('', ''); if(splitNames) { author = splitNames[1]+'' ''+splitNames[0]; } data[prefixDC + "creator"].push(author); } } } else if(fieldCode == "JF") { data[prefixDummy + "publication"].push(fieldContent); } else if(fieldCode == "VL") { data[prefixDummy + "volume"].push(fieldContent); } else if(fieldCode == "IS") { data[prefixDummy + "number"].push(fieldContent); } else if(fieldCode == "Y1") { data[prefixDC + "year"].push(fieldContent); } else if(fieldCode == "PP") { data[prefixDummy + "pages"].push(fieldContent); } else if(fieldCode == "UR") { stableURL = fieldContent; } else if(fieldCode == "SN") { data[prefixDC + "identifier"].push("ISSN "+fieldContent); ISSN = fieldContent; } else if(fieldCode == "PB") { data[prefixDC + "publisher"].push(fieldContent); } } model.addStatement(stableURL, prefixRDF + "type", prefixDummy + "journalArticle", false); for(i in data) { if(data[i].length) { for(j in data[i]) { model.addStatement(stableURL, i, data[i][j]); } } } } } done(); }, function() {}); }, function() {}); wait(); } else { var uri = doc.location.href; var elmts = utilities.gatherElementsOnXPath(doc, doc, ''//comment()'', nsResolver); for(i in elmts) { if(elmts[i].nodeValue.substr(0, 10) == "HeaderData") { var headerRegexp = /HeaderData((?:.|\n)*)\#\#EndHeaders/i var m = headerRegexp.exec(elmts[i].nodeValue); var headerData = m[1]; } } // Use E4X rather than DOM/XPath, because the Mozilla gods have decided not to // expose DOM/XPath to sandboxed scripts var newDOM = new XML(headerData); function mapRDF(text, rdfUri) { if(text) { model.addStatement(uri, rdfUri, text, true); } } mapRDF(newDOM.journal.text(), prefixDummy + "publication"); mapRDF(newDOM.volume.text(), prefixDummy + "volume"); mapRDF(newDOM.issue.text(), prefixDummy + "number"); mapRDF(newDOM.year.text(), prefixDummy + "year"); mapRDF(newDOM.pubdate.text(), prefixDC + "date"); mapRDF(newDOM.doctitle.text(), prefixDC + "title"); // Do ISSN var issn = newDOM.issn.text(); if(issn) { model.addStatement(uri, prefixDC + "identifier", "ISSN "+issn.replace(/[^0-9]/g, ""), true); } // Do pages var fpage = newDOM.fpage.text(); var lpage = newDOM.lpage.text(); if(fpage != "") { var pages = fpage; if(lpage) { pages += "-"+lpage; } model.addStatement(uri, prefixDummy + "pages", pages, true); } // Do authors var elmts = newDOM.docauthor; for(i in elmts) { var fname = elmts[i].fname.text(); var surname = elmts[i].surname.text(); model.addStatement(uri, prefixDC + "creator", fname+" "+surname, true); } model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false); }'); REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-26 16:01:00', 3, 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)', 'if(doc.location.href.indexOf("list_uids=") >= 0) { return "journalArticle"; } else { return "multiple"; }', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; function mapRDF(uri, text, rdfUri) { if(text != "") { model.addStatement(uri, rdfUri, text, true); } } var uri = doc.location.href; var ids = new Array(); var idRegexp = /[\?\&]list_uids=([0-9\,]+)/; var m = idRegexp.exec(uri); if(m) { ids.push(m[1]); } else { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; var items = new Array(); var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''//div[@class="ResultSet"]/table/tbody'', nsResolver); // Go through table rows for(var i=0; i]*>/, "").replace(/<\?xml[^>]*\?>/, ""); var xml = new XML(text); for(var i=0; i; for(var i in items) { var item = items[i]; var isPartialItem = false; if(utilities.inArray(item.itemType, partialItemTypes)) { isPartialItem = true; } var mods = ; mods.@ID = item.itemID; /** CORE FIELDS **/ // XML tag titleInfo; object field title mods.titleInfo.title = item.title; // XML tag typeOfResource/genre; object field type var modsType, marcGenre; if(item.itemType == "book" || item.itemType == "bookSection") { modsType = "text"; marcGenre = "book"; } else if(item.itemType == "journalArticle" || item.itemType == "magazineArticle") { modsType = "text"; marcGenre = "periodical"; } else if(item.itemType == "newspaperArticle") { modsType = "text"; marcGenre = "newspaper"; } else if(item.itemType == "thesis") { modsType = "text"; marcGenre = "theses"; } else if(item.itemType == "letter") { modsType = "text"; marcGenre = "letter"; } else if(item.itemType == "manuscript") { modsType = "text"; modsType.@manuscript = "yes"; } else if(item.itemType == "interview") { modsType = "text"; modsType.@manuscript = "interview"; } else if(item.itemType == "film") { modsType = "moving image"; marcGenre = "motion picture"; } else if(item.itemType == "artwork") { modsType = "still image"; marcGenre = "art original"; } else if(item.itemType == "website") { modsType = "multimedia"; marcGenre = "web site"; } mods.typeOfResource = modsType; mods.genre += {item.itemType}; mods.genre += {marcGenre}; // XML tag genre; object field thesisType, type if(item.thesisType) { mods.genre += {item.thesisType}; } if(item.type) { mods.genre += {item.type}; } // XML tag name; object field creators for(var j in item.creators) { var roleTerm = ""; if(item.creators[j].creatorType == "author") { roleTerm = "aut"; } else if(item.creators[j].creatorType == "editor") { roleTerm = "edt"; } else if(item.creators[j].creatorType == "creator") { roleTerm = "ctb"; } // FIXME - currently all names are personal mods.name += {item.creators[j].lastName} {item.creators[j].firstName} {roleTerm} ; } // XML tag recordInfo.recordOrigin; used to store our generator note mods.recordInfo.recordOrigin = "Scholar for Firefox "+utilities.getVersion(); /** FIELDS ON NEARLY EVERYTHING BUT NOT A PART OF THE CORE **/ // XML tag recordInfo.recordContentSource; object field source if(item.source) { mods.recordInfo.recordContentSource = item.source; } // XML tag recordInfo.recordIdentifier; object field accessionNumber if(item.accessionNumber) { mods.recordInfo.recordIdentifier = item.accessionNumber; } // XML tag accessCondition; object field rights if(item.rights) { mods.accessCondition = item.rights; } /** SUPPLEMENTAL FIELDS **/ // XML tag relatedItem.titleInfo; object field series if(item.series) { var series = {item.series} ; if(item.itemType == "bookSection") { // For a book section, series info must go inside host tag mods.relatedItem.relatedItem = series; } else { mods.relatedItem += series; } } // Make part its own tag so we can figure out where it goes later var part = new XML(); // XML tag detail; object field volume if(item.volume) { if(utilities.isInt(item.volume)) { part += {item.volume}; } else { part += {item.volume}; } } // XML tag detail; object field number if(item.number) { if(utilities.isInt(item.number)) { part += {item.number}; } else { part += {item.number}; } } // XML tag detail; object field section if(item.section) { if(utilities.isInt(item.section)) { part += {item.section}; } else { part += {item.section}; } } // XML tag detail; object field pages if(item.pages) { var start, end; if(typeof(item.pages) == "string" && item.pages.indexOf("-")) { // A page range var pageNumbers = item.pages.split("-"); start = pageNumbers[0]; end = pageNumbers[1]; } else { // Assume start and end are the same start = item.pages; end = item.pages; } part += {start}{end}; } // Assign part if something was assigned if(part.length() != 1) { if(isPartialItem) { // For a journal article, bookSection, etc., the part is the host mods.relatedItem.part += {part}; } else { mods.part += {part}; } } // XML tag originInfo; object fields edition, place, publisher, year, date var originInfo = new XML(); if(item.edition) { originInfo += {item.edition}; } if(item.place) { originInfo += {item.place}; } if(item.publisher) { originInfo += item.publisher; } else if(item.distributor) { originInfo += item.distributor; } if(item.year) { // Assume year is copyright date originInfo += {item.year}; } if(item.date) { if(inArray(item.itemType, ["magazineArticle", "newspaperArticle"])) { // Assume date is date issued var dateType = "dateIssued"; } else { // Assume date is date created var dateType = "dateCreated"; } originInfo += <{dateType} encoding="iso8601">{item.date}; } if(originInfo.length() != 1) { if(isPartialItem) { // For a journal article, bookSection, etc., this goes under the host mods.relatedItem.originInfo += {originInfo}; } else { mods.originInfo += {originInfo}; } } // XML tag identifier; object fields ISBN, ISSN var identifier = null; if(item.ISBN) { identifier = {item.ISBN}; } else if(item.ISSN) { identifier = {item.ISSN}; } if(identifier) { if(isPartialItem) { mods.relatedItem.identifier = identifier; } else { mods.identifier = identifier; } } // XML tag relatedItem.titleInfo; object field publication if(item.publication) { mods.relatedItem.titleInfo += {item.publication}; } // XML tag classification; object field callNumber if(item.callNumber) { mods.classification = item.callNumber; } // XML tag location.physicalLocation; object field archiveLocation if(item.archiveLocation) { mods.location.physicalLocation = item.archiveLocation; } // XML tag location.url; object field archiveLocation if(item.url) { mods.location.url = item.url; } if(mods.relatedItem.length() == 1 && isPartialItem) { mods.relatedItem.@type = "host"; } /** NOTES **/ for(var j in item.notes) { mods.note += {item.notes[j].note}; } modsCollection.mods += mods; } write(modsCollection.toString()); }');