From f6b0d9a541d648e22a08aea37146562c86e8b6f3 Mon Sep 17 00:00:00 2001 From: Simon Kornblith Date: Sun, 25 Jun 2006 22:00:20 +0000 Subject: [PATCH] search results scraping for InfoTrac. closes #15 --- scrapers.sql | 145 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 97 insertions(+), 48 deletions(-) diff --git a/scrapers.sql b/scrapers.sql index 0cf7e3607..79ea42e1c 100644 --- a/scrapers.sql +++ b/scrapers.sql @@ -1131,7 +1131,7 @@ if(doc.title == "Results") { }'); REPLACE INTO "scrapers" VALUES('6773a9af-5375-3224-d148-d32793884dec', '2006-06-18 11:19:00', 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/', -'if(doc.title.substring(0, 8) == "Article ") { +'if(doc.title.substring(0, 8) == "Article " || doc.title.substring(0, 10) == "Citations ") { return true; } return false;', @@ -1145,60 +1145,109 @@ var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; -var uri = doc.location.href; - -var xpath = ''/html/body//comment()''; -var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); -for (var i = 0; i < elmts.length; i++) { - var elmt = elmts[i]; - var colon = elmt.nodeValue.indexOf(":"); - var field = elmt.nodeValue.substring(1, colon).toLowerCase(); - var value = elmt.nodeValue.substring(colon+1, elmt.nodeValue.length-1); - if(field == "title") { - model.addStatement(uri, prefixDC + "title", value, false); - } else if(field == "journal") { - model.addStatement(uri, prefixDummy + "publication", value, false); - } else if(field == "pi") { - parts = value.split(" "); - var date = ""; - var isDate = true; - var rdfUri; - for(j in parts) { - firstChar = parts[j].substring(0, 1); - rdfUri = false; - - if(firstChar == "v") { - rdfUri = prefixDummy + "volume"; - } else if(firstChar == "i") { - rdfUri = prefixDummy + "number"; - } else if(firstChar == "p") { - rdfUri = prefixDummy + "pages"; - var pagesRegexp = /p(\w+)\((\w+)\)/; - var match = pagesRegexp.exec(parts[j]); - if(match) { - var finalPage = parseInt(match[1])+parseInt(match[2]) - parts[j] = "p"+match[1]+"-"+finalPage.toString(); +function extractCitation(uri, elmts, title) { + if(title) { + model.addStatement(uri, prefixDC + "title", utilities.superCleanString(title), true); + } + for (var i = 0; i < elmts.length; i++) { + var elmt = elmts[i]; + var colon = elmt.nodeValue.indexOf(":"); + var field = elmt.nodeValue.substring(1, colon).toLowerCase(); + var value = elmt.nodeValue.substring(colon+1, elmt.nodeValue.length-1); + if(field == "title") { + model.addStatement(uri, prefixDC + "title", utilities.superCleanString(value), true); + } else if(field == "journal") { + model.addStatement(uri, prefixDummy + "publication", value, true); + } else if(field == "pi") { + parts = value.split(" "); + var date = ""; + var isDate = true; + var rdfUri, type; + for(j in parts) { + firstChar = parts[j].substring(0, 1); + rdfUri = false; + + if(firstChar == "v") { + rdfUri = prefixDummy + "volume"; + type = prefixDummy + "journalArticle"; + } else if(firstChar == "i") { + rdfUri = prefixDummy + "number"; + } else if(firstChar == "p") { + rdfUri = prefixDummy + "pages"; + var pagesRegexp = /p(\w+)\((\w+)\)/; + var match = pagesRegexp.exec(parts[j]); + if(match) { + var finalPage = parseInt(match[1])+parseInt(match[2]) + parts[j] = "p"+match[1]+"-"+finalPage.toString(); + } else if(!type) { + var justPageNumber = parts[j].substr(1); + if(parseInt(justPageNumber).toString() != justPageNumber) { + type = prefixDummy + "newspaperArticle"; + } + } + } + + if(rdfUri) { + isDate = false; + if(parts[j] != "pNA") { // not a real page number + var content = parts[j].substring(1); + model.addStatement(uri, rdfUri, content, false); + } else if(!type) { + type = prefixDummy + "newspaperArticle"; + } + } else if(isDate) { + date += " "+parts[j]; } } - if(rdfUri) { - isDate = false; - if(parts[j] != "pNA") { // not a real page number - var content = parts[j].substring(1); - model.addStatement(uri, rdfUri, content, true); - } - } else if(isDate) { - date += " "+parts[j]; + // Set type + if(!type) { + type = prefixDummy + "magazineArticle"; } + model.addStatement(uri, prefixRDF + "type", type, false); + + if(date != "") { + model.addStatement(uri, prefixDC + "date", date.substring(1), true); + } + } else if(field == "author") { + model.addStatement(uri, prefixDC + "creator", utilities.cleanAuthor(value), true); } - if(date != "") { - model.addStatement(uri, prefixDC + "date", date.substring(1), false); - } - } else if(field == "author") { - model.addStatement(uri, prefixDC + "creator", utilities.cleanAuthor(value), false); } } -model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false);'); + + +var uri = doc.location.href; +if(doc.title.substring(0, 8) == "Article ") { + var xpath = ''/html/body//comment()''; + var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + extractCitation(uri, elmts); +} else { + var items = new Array(); + var uris = new Array(); + var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''/html/body//table/tbody/tr/td[a/b]'', nsResolver); + // Go through table rows + for(var i=0; i