#!/usr/local/bin/awk -f

@include "init.awk"
@include "library.awk"

BEGIN {

# ################################################################################################
#
# Internet Archive Search (ias). Adapted from iacs (Internet Archive Classic Search)
# Stephen Balbach October 2015
#
# Pass variable:
#
#   -v id="/home/adminuser/wi-awb/temp/wi-awb-1026211441/"
#   -v id="/home/adminuser/wi-awb/temp/wi-awb-1031200119/"
#
# Optionally pass:
#
#   -v cache="delete"
#
# to delete the iascache directory when done (ie. when running via wi.csh)
#
# The purpose of this program is to search each book's metadata from a particular IA search
# and determine if it belongs to the intended author. It will search the first 50 results
# or whatever "Rows" is set to. It returns the number of books that it thinks match that author
# in the file ias.result -- see talgo.awk for algorithmic processing to determine best search choice.
#
# ____________________________________________________________________________________________
#
# Configuration variables:
#
# 1. Program filename - the name of the script. Plus other hard coded paths

  MyProg = "ias.awk"
  TDir = id             # working directory; concatenated directly with file names, so it must end in "/"

# 5. Number of results per page (50 was the IA default). Can be anything based on
#    your computer speed and memory etc..

  Rows = 500

#
# 7. URL Agent string - this is what Internet Archive sees in their logs. Recommend
#    setting the name of the program and your contact information, such as an email
#    or your Internet Archive userid.

  Agent = "IAS.AWK: Wikipedia<->IA Project (iacs2015@nym.hush.com)"

#
# Maximum year a book can be published. ie. if a book publication year is beyond this then reject it as garbage data
#

  MaxYear = 2030

#
# ##################################################################################################

  delete G

  main()

  if( checkexists(TDir "iascache") && cache ~ /^delete$/ )   # Remove cached files save diskspace
    sys2var( Exe["rm"] " -r -- " TDir "iascache")

}

#
# Drive one run: read the candidate searches from TDir "ia-results", score each of them with
# search_ia(), write TDir "ias.result" and TDir "ias.scope", then run talgo on ias.result and
# print its verdict (TDir "ias.out").
#
# Writes the globals WPArticle, Result, Final and Scope. Exe[], checkexists(), readfile(),
# sys2var(), clean() and strip() are not defined in this file (presumably init.awk / library.awk).
#
# Everything after the first gap in the parameter list is a local variable.
#
function main(    s,c,a,b,k,kname,mname,fulln,ktype,khits,m,n,o,d,count,debug,debugid,dest,i,re,oo) {

  debug = 0                             # 1=on, 0=off
  debugid = "tmpO@Alan_Maxwell@t@2"     # Specified in this format

  WPArticle = clean( readfile(TDir "article.txt") )

  delete Result
  delete Final
  delete Scope

  s = readfile(TDir "ia-results")       # Don't clean() - retain original

  # Each line is split on "|"; field 3 is used as the name, field 1 as the search type and
  # field 2 as the hit count.
  c = split(s, a, "\n")
  i = 0
  while (i++ < c) {
    if(a[i] != "") {                    # remove blank lines
      split(a[i], b, "|")
      if(b[2] != 0)                     # remove 0 hit results
        k[b[3]][b[1]][b[2]] = 1         # name|type|hits
    }
  }

  if(! checkexists(TDir "iascache") )
    sys2var( Exe["mkdir"] " " TDir "iascache")
  checkexists(TDir "iascache", "ias.awk main()", "exit")

  for(kname in k) {
    m[kname] = kname
  }

  for(mname in m) {
    #print "mname = " mname >> "bah"
    for(kname in k) {
      #print "kname = " kname >> "bah"
      # Compare as plain strings: a name is data, not a regular expression. Matching it as
      # "^name$" broke on names containing regex metacharacters such as ( ) * + [ .
      if( kname == mname ) {
        #print "MATCHED" >> "bah"
        for(ktype in k[kname]) {
          #print "ktype = " ktype >> "bah"
          for(khits in k[kname][ktype]) {
            #print "khits = " khits >> "bah"
            # The numeric index fixes the processing order below: tx, t, w, none.
            if(ktype == "none")
              n[4] = "tmpO@" gensub(" ","_","g",kname) "@" ktype "@"
            if(ktype == "w")
              n[3] = "tmpO@" gensub(" ","_","g",kname) "@" ktype "@"
            if(ktype == "t")
              n[2] = "tmpO@" gensub(" ","_","g",kname) "@" ktype "@"
            if(ktype == "tx")
              n[1] = "tmpO@" gensub(" ","_","g",kname) "@" ktype "@"
            fulln = kname
          }
        }
      }
    }

    # NOTE(review): this is a process-wide gawk setting; it stays in force for every later
    # for-in loop in the program, not just the one below.
    PROCINFO["sorted_in"] = "@ind_num_asc"

    for(o in n) {
      # print "n[o] = " n[o]
      # Find the first numeric suffix 1..20 for which the file exists.
      c = 0
      while(c++ < 20) {
        if( checkexists(TDir n[o] c) )
          break
      }
      s = readfile(TDir n[o] c)         # Don't clean() - retain original
      print "________________________________________________________________________________________________________________________________"
      print n[o] c
      print "________________________________________________________________________________________________________________________________"

      # a[1] becomes the query part of the search URL stored in the file.
      split(gensub("[[]http://archive.org/search.php[?]query=","","g",s),a," ")
      d = split(mname,b," ")            # b[1] = first word of the name, b[d] = last word

      count = 0
      if(b[1] !~ /^Sir$/) {             # Skip names that begin with "Sir Whatever" as they produce odd results
        if(debug) {
          re = n[o] c
          if(debugid ~ re)
            count = search_ia(a[1],1,"",b[1],b[d],fulln,n[o] c)
        }
        else {
          count = search_ia(a[1],1,"",b[1],b[d],fulln,n[o] c)
        }
      }
      else {
        count--
        Result[n[o] c]["realcount"] = -1
        Result[n[o] c]["numfound"] = -1
      }
      print " |1-----|"
      print count
      Result[n[o] c]["count"] = count
    }
    delete n
  }

  # Create ias.result (list of possibles)

  if(checkexists(TDir "ias.result")) {
    close(TDir "ias.result")
    sys2var( Exe["rm"] " " TDir "ias.result")
  }
  asorti(Result, dest)
  if(length(dest)) {
    for(o in dest)
      print dest[o] " | " Result[dest[o]]["count"] " | " Result[dest[o]]["realcount"] " | " Result[dest[o]]["numfound"] >> TDir "ias.result"
    close(TDir "ias.result")
  }

  # Create ias.scope (list of metadata)

  if(checkexists(TDir "ias.scope"))
    sys2var( Exe["rm"] " " TDir "ias.scope")
  for(o in Scope) {
    for(oo in Scope[o])
      print o "|" oo "|" Scope[o][oo]["mediatype"] "|" Scope[o][oo]["date"] "|" Scope[o][oo]["jstor"] >> TDir "ias.scope"
  }
  close(TDir "ias.scope")

  print " |2-----|"

  # Run talgo.awk algorithm to find best choice. Load from ias.result and save to ias.out - "-s" save to ia9out-scope

  if(length(dest))
    sys2var( Exe["talgo"] " -s -l " TDir "ias.result")

  print strip( readfile(TDir "ias.result") )
  print "Final: " strip( readfile(TDir "ias.out") )

}

#
# Copy the Result[] entry for "name" into the array "final".
# The "result" parameter is not used; the global Result is read instead.
#
function copyarray(result, final, name) {

  final["name"] = name
  final["count"] = Result[name]["count"]
  final["realcount"] = Result[name]["realcount"]
  final["numfound"] = Result[name]["numfound"]

}

#
# Search Internet Archive and format results.
#
# Parameters:
#   entity    query string appended verbatim to the advancedsearch URL
#   pagenum   result page to request
#   sort      value placed in the first sort[] URL parameter
#   firstn    first word of the author's name
#   lastn     last word of the author's name
#   fulln     full author name
#   idstring  key under which results are stored in Result[] and Scope[]
#
# Returns the sum of the per-work scores from find_searchtokens_full() (+1 / -1 each),
# -1 if nothing could be downloaded, or -1001 when it gives up early because the running
# score has fallen too far to recover.
#
# Side effects: fills Result[idstring]["realcount"], Result[idstring]["numfound"], and
# (via the helpers) TokensFull[] and Scope[].
#
# NOTE(review): the markup literals in this function (entity names, <doc>, <arr name=...>,
# <str name=...>, </str>) are reconstructed from the adjacent comments and closing-tag
# patterns - confirm against a live IA XML response.
#
function search_ia(entity, pagenum, sort, firstn, lastn, fulln, idstring,      head,tail,url,xml,c,numfound,doc,page,arr,arrg,arrgh,qin,G,i,count,tokenhits) {

  head = "http://archive.org/advancedsearch.php?q="
  tail = "&fl[]=date&fl[]=publisher&fl[]=collection&fl[]=creator&fl[]=description&fl[]=downloads&fl[]=identifier&fl[]=mediatype&fl[]=subject&fl[]=title&sort[]=" sort "&sort[]=&sort[]=&rows=" Rows "&output=xml&callback=callback&save=yes&page=" pagenum
  url = head entity tail

  #print url >> "bah"

  # Download XML from Internet Archive

  xml = http2var2(url)
  if ( xml == "" || length(xml) == 0) {
    print "Error in function search_ia: Unable to retrieve data from Internet Archive." > "/dev/stderr"
    return -1
  }

  if(length(xml) < 100) {               # Bug in Internet Archive.
    print "Warning: XML bug. Using build_xml()"
    xml = build_xml(head,tail,entity,Rows,sort,pagenum)
  }

  # Decode the XML entities. &amp; goes last so "&amp;lt;" ends up as "&lt;", not "<".
  gsub(/&lt;/,"<",xml);gsub(/&gt;/,">",xml);gsub(/&quot;/,"\"",xml);gsub(/&amp;/,"\\&",xml)

  # print xml > "test.xml"
  # close("test.xml")

  Result[idstring]["realcount"] = 0

  # doc[1] is the response header; the works are the even elements doc[2], doc[4], ...
  c = split(xml, doc, "(<doc>|</doc>)")

  # numFound="432"
  match(xml, "numFound=\"[0-9]+\"", arr)
  split(arr[0], arrg, "\"")
  numfound = arrg[2]
  Result[idstring]["numfound"] = numfound

  # name="qin">text< (search string as reported by IA)
  split(doc[1], arr, "(name=\"qin\">)")
  split(arr[2], arrg, "</str>")
  qin = arrg[1]

  init_searchtokens_full(qin,firstn,lastn)

  count = 0
  i = 0
  while(i < (Rows * 2) ) {

    i = i + 2
    delete G

    # identifier (name="identifier">presidentgarfiel00hinsuoft<)
    match(doc[i], "name=\"identifier\">[^<]+<", arr)
    split(arr[0],arrg,"(<|>)")
    G["identifier"] = arrg[2]
    if ( arrg[2] == "" )
      continue                          # Skip if no id

    # creator and subject and collection (if multiple names they are surrounded by <arr>, remove those
    # and leave each name surrounded by <str>)
    ad = 0
    ac = split(doc[i], arr, "\n")       # possible point of failure if XML format ever stopped having a \n after each line
    while(ad++ < ac) {
      if(arr[ad] ~ /<arr name="creator">/ ) {
        gsub(/<arr name="creator">/,"",arr[ad])
        gsub(/<\/arr>/,"",arr[ad])
        G["creator"] = strip(arr[ad])
        continue
      }
      if(arr[ad] ~ /<str name="creator">/ ) {
        gsub(/<str name="creator">/,"",arr[ad])
        gsub(/<\/str>/,"",arr[ad])
        G["creator"] = strip(arr[ad])
        continue
      }
      if(arr[ad] ~ /<arr name="subject">/ ) {
        gsub(/<arr name="subject">/,"",arr[ad])
        gsub(/<\/arr>/,"",arr[ad])
        G["subject"] = strip(arr[ad])
        continue
      }
      if(arr[ad] ~ /<str name="subject">/ ) {
        gsub(/<str name="subject">/,"",arr[ad])
        gsub(/<\/str>/,"",arr[ad])
        G["subject"] = strip(arr[ad])
        continue
      }
      if(arr[ad] ~ /<arr name="collection">/ ) {
        gsub(/<arr name="collection">/,"",arr[ad])
        gsub(/<\/arr>/,"",arr[ad])
        G["collection"] = strip(arr[ad])
        continue
      }
      if(arr[ad] ~ /<str name="collection">/ ) {
        gsub(/<str name="collection">/,"",arr[ad])
        gsub(/<\/str>/,"",arr[ad])
        G["collection"] = strip(arr[ad])
        continue
      }
    }

    # description (name="description">text<)
    # The separator stops just before the ">" of the opening tag; substr(.., 2) drops that ">".
    split(doc[i], arr, "(<str name=\"description\")")
    split(arr[2], arrg, "</str>")
    G["description"] = striphtml(substr(arrg[1], 2))

    # date (name="date">1881-01-01T00:00:00Z) - only the year is kept
    match(doc[i], "name=\"date\">[^<]+<", arr)
    split(arr[0],arrg,"(<|>)")
    split(arrg[2],arrgh,"-")
    G["date"] = strip(arrgh[1])

    # mediatype (name="mediatype">texts<)
    match(doc[i], "name=\"mediatype\">[^<]+<", arr)
    split(arr[0],arrg,"(<|>)")
    G["mediatype"] = arrg[2]

    # title (name="title">Title<)
    match(doc[i], "name=\"title\">[^<]+<", arr)
    split(arr[0],arrg,"(<|>)")
    G["title"] = arrg[2]

    currcount = find_searchtokens_full(G, fulln, idstring)
    count = count + currcount
    if(currcount > 0) {
      Result[idstring]["realcount"]++
    }

    if(numfound <= Rows)                # Abort early - no chance
      base = numfound
    else
      base = Rows
    if(count < ( ( (base / 2) * -1) -1) )
      return -1001

  }

  return count

}

#
# Score one work (G[]) against the search tokens in TokensFull[].
#
#   G         metadata of the work: identifier, creator, subject, collection, description,
#             date, mediatype, title. Modified in place ("-" replaced by " ", identifier
#             defaulted to "unknown").
#   fulln     full author name
#   idstring  key into Scope[]
#
# Returns 1 when the final count is above zero, otherwise -1. Also returns -1 straight away
# for opensource_audio collections, software mediatype and arxiv identifiers.
# Records mediatype / date / jstor flags for the work in Scope[idstring][identifier].
#
function find_searchtokens_full(G, fulln, idstring,     token,subentity,count,cc,ci,ca,da,dc,di,dm,re,stopflag,debug,debugid,debugo) {

  # Enable debug to examine how it scores individual work IDs. Some output saved to file "test" (overwritten with each run)
  debug = 0                             # 1=on, 0=off
  debugid = "landscapingfordu00giro"
  debugo = "/home/pepper/wi-awb/debugo"

  if(G["identifier"] ~ debugid && debug)
    print "Starting" > debugo

  dc = split(TokensFull["_date"], da, " ")
  count = 0

  if( searchwikipedia(G["title"]) ) {   # If the book title is in the Wikipedia article..
    count++
    if(G["identifier"] ~ debugid && debug) {
      print " Matched:"
      print "+1 in special 1-2 for " subentity >> debugo
    }
    print " " G["identifier"] " | title | (wikipedia article) | = " G["title"]
  }

  if( length(G["identifier"]) < 1)
    G["identifier"] = "unknown"

  Scope[idstring][G["identifier"]]["mediatype"] = G["mediatype"]
  Scope[idstring][G["identifier"]]["date"] = 0      # 1 if work is matched on a date
  Scope[idstring][G["identifier"]]["jstor"] = 0     # 1 if work is matched on creator and work is JSTOR

  for(subentity in G) {

    if(G["identifier"] ~ debugid && debug)
      print "--Starting subentity " subentity

    if(subentity ~ /^collection$/ ) {   # Skip collection:opensource_audio works
      if(G[subentity] ~ /opensource_audio/) {
        return -1
      }
    }
    if(subentity ~ /^mediatype$/ ) {    # Skip mediatype:software works
      if(G[subentity] == "software") {
        return -1
      }
    }
    if(subentity ~ /^identifier$/ ) {   # Skip arxiv works
      if(G[subentity] ~ /arxiv/) {
        return -1
      }
      continue                          # Skip "identifier" subentity
    }

    # Remove all "-" (same thing done in init_searchtokens())
    G[subentity] = strip(gensub("-"," ","g",G[subentity]))

    stopflag = 0
    tokenflag = 0

    for(token in TokensFull) {

      if(TokensFull[token] == "")
        continue

      # Skip the name tokens if 1 has already matched to avoid repeat matches due to regex+agrep
      if(tokenflag == 1 && substr(token,1,1) !~ /_/ )
        continue

      if(token ~ /^_(lastname|firstname|lastname_special)$/) {
        if(G["identifier"] ~ debugid && debug)
          print "Skipping " token " in " subentity >> debugo
        continue
      }

      agrep_debug = 0
      if(G["identifier"] ~ debugid && debug)
        agrep_debug = 1

      if(token == "_date" && subentity !~ /^creator$|^subject/) {     # See below for creator/subject + date check
        re = "(" daterange(TokensFull["_birth"]) ").+{1,3}(" daterange(TokensFull["_death"]) ")"
        if( match(G[subentity], re) && agrep(G[subentity], TokensFull["_lastname"], ".25", agrep_debug, "plain") ) {
          count++
          if(G["identifier"] ~ debugid && debug) {
            print " Matched:"
            print "+1 in _date: G[" subentity "]=|" G[subentity] "| TokensFull[" token "]=|" TokensFull[token] "| lastname=|" TokensFull["_lastname"] "|" >> debugo
          }
          print " " G["identifier"] " | " subentity " |" TokensFull[token] "| = " G[subentity] " (a)"
          Scope[idstring][G["identifier"]]["date"] = 1      # 1 if work is matched on a date
          continue
        }
      }

      if(token == "_firstname_special") {
        if( agrep(G[subentity], TokensFull["_firstname_special"], ".25", agrep_debug, "plain") && agrep(G[subentity], TokensFull["_lastname_special"], ".25", agrep_debug, "plain") ) {
          count++
          if(G["identifier"] ~ debugid && debug) {
            print " Matched:"
            print "+1 in special for " subentity >> debugo
          }
          print " " G["identifier"] " | " subentity " |" TokensFull["_firstname_special"] " " TokensFull["_lastname_special"] "| = " G[subentity] " (b)"
          continue
        }
      }

      # If creator or subject string contains a date, and it doesn't match one of the birth-death dates in TokensFull, then don't make a match.
      if(subentity ~ /^creator$|^subject/ && token != "_birth" && token != "_death") {

        if(G["identifier"] ~ debugid && debug && stopflag == 0) {
          print " " subentity ": " G[subentity]
          stopflag = 1
          #for(z in TokensFull)
          #print z " = " TokensFull[z]
          #print "subentity = " subentity
          #print "token = " token
        }

        cc = split(G[subentity], ca, /<str>|<\/str>/)       # when multi-names are separated by <str>/<\/str>
        # With a leading <str> the first piece is empty, so the first name sits at index 2.
        G[subentity] ~ /<str>/ ? caindex = 2 : caindex = 1

        if(G["identifier"] ~ debugid && debug)
          print "caindex(1) = " ca[caindex]

        if(ca[caindex] ~ /;/) {                             # when multi-names are separated by ; ..this may break in certain cases
          cc = split(ca[caindex], ca, /;/)
        }
        else if(ca[caindex] ~ /[ ]and[ ]/) {                # when multi-names are separated by / and / eg. archive.org/details/UsgsBulletin507MiningDistrictsOfTheWesternUnitedStates
          cc = split(ca[caindex], ca, /[ ]and[ ]/)
        }
        else if(G["description"] ~ /by user tpb/ && ca[caindex] !~ /[0-9]{4}/ && G[subentity] !~ /<str>/ && length(ca[caindex]) > length(fulln) + 15 ) {   # when multi-names sep by , ("uploaded by user tpb")
          cc = split(ca[caindex], ca, /,/)
        }

        #if(G["identifier"] ~ debugid && debug)
        #print "caci(-2) = " ca[1]

        ci = 0
        while(ci++ < cc) {

          if(ca[ci] == "")
            continue

          ca[ci] = strip(ca[ci])

          #if(G["identifier"] ~ debugid && debug)
          #print "caci(-1) = " ca[ci]

          gsub(/[[]/,"",ca[ci])
          gsub(/[]]/,"",ca[ci])

          #if(G["identifier"] ~ debugid && debug)
          #print "caci(0) = " ca[ci]

          re = "[.|,|;|-]{0,1}[ ]{0,1}from old catalog.*$"   # remove trailing "[from old catalog]" and variations
          if(ca[ci] ~ re)
            ca[ci] = rmtrail( gensub(re, "", "g", ca[ci]) )

          #if(G["identifier"] ~ debugid && debug)
          #print "caci(0.1) = " ca[ci]

          if(token != "_date") {                            # leave date in if checking for date otherwise remove it

            re = "[Bb][.][ ]{0,1}" "(" daterange(TokensFull["_birth"]) ")$"     # remove trailing "b. 1860"
            if(ca[ci] ~ re) {
              ca[ci] = rmtrail( gensub(re, "", "g", ca[ci]) )
            }

            #if(G["identifier"] ~ debugid && debug)
            #print "caci(1) = " ca[ci]

            re = "(" daterange(TokensFull["_birth"]) ").+{1,3}(" daterange(TokensFull["_death"]) ")"
            ca[ci] = rmtrail( gensub( re, "", "g", ca[ci]) )                    # remove date

            #if(G["identifier"] ~ debugid && debug)
            #print "caci(2) = " ca[ci]

            ca[ci] = rmtrail( gensub(/[(][^\)]*[)]$/, "", "g", ca[ci]) )        # remove final parenthesis content eg. Loudon, W. J. (William James)

            #if(G["identifier"] ~ debugid && debug)
            #print "caci(3) = " ca[ci]

            re = "(" daterange(TokensFull["_birth"]) ")[-]{0,1}$"               # remove trailing "1860-" ie. when no death date is given
            if(ca[ci] ~ re) {
              ca[ci] = rmtrail( gensub(re, "", "g", ca[ci]) )
              #if(G["identifier"] ~ debugid && debug) {
              #print "re = " re
              #print "caci(4) = " ca[ci]
              #}
            }

            ca[ci] = tolower(ca[ci])
            ca[ci] = rmtrail( gensub("sir$", "", "g", ca[ci]) )                 # remove trailing "Sir"
            ca[ci] = strip( gensub("^sir ", "", "g", ca[ci]) )                  # remove leading "Sir "
            ca[ci] = strip( gensub(/annotator|translator/, "", "g", ca[ci]) )   # remove various
          }

          if(token != "_date") {
            re = "^" regesc2(tolower(TokensFull[token])) "$"                    # eg. "^william t[.] volk$"
            gsub("-","",re)                                                     # Is this needed?
          }
          else {
            # eg, "1860.+{1}1910" which is like 1860?1910 - will match "#-#", "# - #", "# #", etc.. also catch "b. ####$" and "####-$"
            re = "(" daterange(TokensFull["_birth"]) ").+{1,3}(" daterange(TokensFull["_death"]) ")|([Bb][.][ ]{0,1}(" daterange(TokensFull["_birth"]) ")$)|((" daterange(TokensFull["_birth"]) ")[-]{0,1}$)"
          }

          agrep_debug = 0
          if(G["identifier"] ~ debugid && debug) {
            print " Step 1 (" token "): " tolower(ca[ci]) " | " re
            agrep_debug = 1
          }

          if(token != "_date" && agrep(ca[ci], re, ".25", agrep_debug, "regex", length(TokensFull[token]) ) ) {
            dm = 1
            if(subentity ~ /^creator$/ && tolower(G["identifier"]) ~ /jstor/)
              Scope[idstring][G["identifier"]]["jstor"] = 1                     # 1 if creator field matches and JSTOR
          }

          # Check for lastname: ^roy|[ ]roy|[(]roy
          re2 = "^" regesc2(tolower(TokensFull["_lastname"])) "|[ ]{1,}" regesc2(tolower(TokensFull["_lastname"])) "|[(]" regesc2(tolower(TokensFull["_lastname"]))

          if(token == "_date" && match(ca[ci], re) && agrep(ca[ci], re2, ".25", agrep_debug, "regex", length(TokensFull["_lastname"]) ) ) {   # match date + lastname
            dm = 1
            Scope[idstring][G["identifier"]]["date"] = 1                        # 1 if work is matched on a date
          }

          if(dm == 1) {
            if(G["identifier"] ~ debugid && debug) {
              print " Matched:"
              print " +1 for " token " in " subentity >> debugo
            }
            print " " G["identifier"] " | " subentity " |" TokensFull[token] "| = " ca[ci] " (c)"
            count++
            dm = 0
            tokenflag = 1
          }
        }
        continue
      }

      agrep_debug = 0
      if(G["identifier"] ~ debugid && debug)
        agrep_debug = 1

      agrep_type = "regex"
      agrep_weight = "0.25"
      if(subentity ~ /^description$|^title$/) {
        if(token !~ /_date|_birth|_death/ ) {
          if(TokensFull[token] ~ /[.]/) {
            re = tolower(TokensFull[token])
            agrep_type = "plain"
            agrep_weight = "0.10"
          }
          else
            re = "^" regesc2(tolower(TokensFull[token])) "|[ ]{1,}" regesc2(tolower(TokensFull[token])) "|[(]" regesc2(tolower(TokensFull[token]))
        }
        else
          re = regesc2(tolower(TokensFull[token]))
      }
      else {
        re = regesc2(tolower(TokensFull[token]))
      }

      if( agrep(tolower(G[subentity]), re, agrep_weight, agrep_debug, agrep_type, length(TokensFull[token]) ) && token != "_birth" && token != "_death" ) {
        count++
        if(G["identifier"] ~ debugid && debug) {
          print " Matched:"
          print "+1 for " token " (" TokensFull[token] ") in " subentity ": " G[subentity] >> debugo
        }
        print " " G["identifier"] " | " subentity " |" TokensFull[token] "| = " G[subentity] " (d)"
        tokenflag = 1
      }
    }
  }

  if(datehit(G,TokensFull["_birth"],count) == 0)    # This must be the last modifier to count
    count--

  if(G["identifier"] ~ debugid && debug)
    close(debugo)

  print G["identifier"] " | " count

  if(count > 0)
    return 1
  return -1

}

#
# Return 1 if book year is within the range author's year of birth + 20 -> year of death
# Returns 0 when the date is missing or outside that range.
# May fill in G["date"] (G is an array, so the caller sees the change) by calling scrapedate().
#
function datehit(G,birth,count) {

  # Don't penalize audio/video collections which don't have historic dates
  if(G["mediatype"] !~ "texts")
    return 1

  # Don't penalize Project Gutenberg which have no dates
  if(tolower(G["description"]) ~ /project gutenberg/)
    return 1

  if(birth < 1830 )                     # Don't penalize old authors since most of their books on IA will be later-period reprints
    return 1

  if(G["date"] == "" && count == 1) {   # Scrape date only if count is 1 - anything more/less won't matter. This should speed up processing.
    G["date"] = scrapedate(G["identifier"])
  }

  print "\nDATE = " G["date"]

  if(G["date"] == "")
    return 0

  if(G["date"] >= TokensFull["_birth"] + 20 && G["date"] <= TokensFull["_death"])
    return 1

  return 0

}

#
# Rebuild the global TokensFull[] from the search string IA reports back (qin).
#
# Always sets _lastname and _firstname. Sets _date / _birth / _death when a quoted
# "YYYY-YYYY" is present, _firstname_special / _lastname_special for the unusual query
# shapes listed below, and one entry per quoted name that follows a ":" in the query.
#
function init_searchtokens_full(qin, firstn, lastn      ,a,c,d,e,f,h,j,k,head,dm,safe,aoc,aoa,aoaa,aoi,andorstr) {

  delete TokensFull

  TokensFull["_lastname"] = lastn
  TokensFull["_firstname"] = firstn

  # Populate _first/lastname_special when a unusual search string. See test.awk specialtest() for checking test cases.
  # Note: these are specific and need to be modified if search string format (qin) ever changes ie. in iaa.lua

  # andorstr is the sequence of boolean operators in the query, eg. "AND OR AND"
  aoc = patsplit(qin, aoa, /AND|OR/)
  while(aoi++ < aoc)
    andorstr = andorstr " " aoa[aoi]
  andorstr = strip(andorstr)

  # The six-operator shape has to be tested first: the shorter patterns below are unanchored
  # and "AND OR AND" is a substring of it, which would otherwise always win.
  if(andorstr ~ "AND OR AND OR OR AND") {
    split(qin,aoa,/AND|OR/)
    gsub(/[)]|[(]|["]/,"",aoa[2])
    gsub(/[)]|[(]|["]/,"",aoa[4])
    TokensFull["_firstname_special"] = strip(aoa[2])    # first name is the 2nd operand
    TokensFull["_lastname_special"] = strip(aoa[4])     # last name is the 4th operand
  }
  else if(andorstr ~ /AND OR OR AND OR|AND OR AND|AND OR OR AND|AND OR AND OR/ ) {
    split(qin,aoa,/AND|OR/)
    gsub(/[)]|[(]|["]/,"",aoa[2])
    aoi = split(aoa[2],aoaa," ")
    TokensFull["_firstname_special"] = strip(aoaa[1])
    TokensFull["_lastname_special"] = strip(aoaa[aoi])
  }

  # AND OR OR AND OR
  #   5+word extended ascii
  #   example: (-mediatype:software) AND (((Claude Prosper Jolyot de Crébillon) OR (Claude Prosper Jolyot de Cr*billon)) OR ("1707-1777" AND ("Crébillon" OR "Crebillon")))
  # AND OR OR AND
  #   5+word extended ascii
  #   example: (-mediatype:software) AND ((François Christophe Edmond de Kellermann OR (Fran*ois Christophe Edmond de Kellermann)) OR ("1802-1868" AND "Kellermann"))
  # AND OR AND OR
  #   5+word extended ascii
  #   example: (-mediatype:software) AND ((Claude Prosper Jolyot de Crébillon) OR ("1707-1777" AND ("Crébillon" OR "Crebillon")))
  # AND OR AND
  #   5+word extended ascii
  #   example: (-mediatype:software) AND ((François Christophe Edmond de Kellermann) OR ("1802-1868" AND "Kellermann"))
  # AND OR AND OR OR AND
  #   First char extended ascii special search
  #   example: (-mediatype:software) AND ((("Étienne" OR "Etienne") AND ("Aignan" OR "Aignan")) OR ("1773-1824" AND "Aignan"))

  # Walk the pieces of qin that sit between double quotes.
  c = split(qin, a, "\"")

  offset = 0
  if(length(a[1]) > 0)
    offset = 1

  i = 0 + offset
  while(i++ < c) {
    if(a[i] ~ /^[0-9]{4}-[0-9]{4}$/) {
      TokensFull["_date"] = a[i]
      i = i + offset
      split(TokensFull["_date"], h, "-")
      TokensFull["_birth"] = h[1]
      TokensFull["_death"] = h[2]
      continue
    }
    # Filter out 1-word names (unless name really is 1 word)
    if(firstn !~ lastn) {
      d = split(a[i],e," ")
      if(d == 1) {
        i = i + offset
        continue
      }
    }
    # Filter out when last character is "." eg. "Jéquier, G."
    if( substr(a[i], length(a[i]), 1) ~ /[.]/ ) {
      i = i + offset
      continue
    }
    # Filter out duplicates caused by extended ascii eg. Gustave Jéquier / Gustave Jequier (note the é/e)
    # because agrep will handle these cases in find_searchtokens(). This will break badly if search syntax ever changes.
    if(a[i - 1] ~ /[:]$/)
      TokensFull[a[i]] = a[i]
    i = i + offset
  }

  # Remove all "-" (same thing done in find_searchtokens())
  for(head in TokensFull)
    TokensFull[head] = strip(gensub("-"," ","g",TokensFull[head]))

  print qin
  print " |5-----|"
  for(head in TokensFull)
    print head " = " TokensFull[head]
  print " |6-----|"

}

#
# highlight search tokens
#
# Returns str with every word that matches a key of Tokens[] (case-insensitive, after
# removing surrounding punctuation) wrapped in the highlight strings.
#
function hightokens(str         ,c,arr,i,t,out,build,work) {

  c = split(str, arr, " ")

  while(i < c) {
    i++
    out = ""
    work = arr[i]
    gsub("([,]|\"|[.]$|\"[.]$|'[.]$|`[.]$|;|[:]|^'|'$|^`|`$|'s$|'s[.]$)","",work)      # <-- Customize search token exceptions
    for ( t in Tokens ) {
      if ( tolower(t) == tolower(work) ) {
        # NOTE(review): both wrapper strings are empty here, so nothing is actually
        # highlighted - looks like the opening/closing markup was lost. Confirm.
        out = "" arr[i] ""
        break
      }
    }
    out == "" ? out = arr[i] : ""
    i == 1 ? build = out : build = build " " out
  }

  return build
}

#
# Rebuild the global Tokens[] from an IA query string: quotes removed, "+" treated as a
# space, boolean operators and anything containing ":" skipped.
#
function init_searchtokens(iaquery       ,arr, arrg, i, k, c) {

  delete Tokens

  gsub("\"","",iaquery)
  gsub("+"," ",iaquery)
  split(iaquery, arr, "+")
  k = join(arr, 1, len2(arr), " ")
  c = split(k, arrg, " ")

  while ( i++ < c) {
    if ( arrg[i] !~ "(AND|OR|:)" ) {    # <-- Customize tokens to ignore eg. logic statements, etc..
      gsub(/[,]+$/,"",arrg[i])
      gsub(/[)]+$/,"",arrg[i])
      gsub(/^[(]+/,"",arrg[i])
      Tokens[arrg[i]] = arrg[i]
    }
  }
}

#
# If date of work is empty, it may be due to bug at Internet Archive caused when date field contains non-ascii characters it returns empty date.
# See examples paramountmethodf00fode ("c1922") and greeceallies191400abbouoft ("[1922]")
# This function scrapes the main work page searching for "stated date is 1922" in copyright-notice, or failing that looking at the meta.xml file.
#
# Returns a 4-digit year, or "" when none is found. Downloaded pages are cached under
# TDir "iascache/".
#
function scrapedate(wid,        page,a,b,c,d,e,metaxml,debug,debugid,debugo,widu,fname) {

  # Enable debug to examine how it handles individual work IDs.
  debug = 0                             # 1=on, 0=off
  debugid = "visionbowditchhp00bowdrich"
  debugo = "/home/adminuser/wi-awb/testpage"

  widu = gensub(/ /,"_","g",wid)        # Convert " " to "_". Should never happen but in case.

  # Read from local cache, if not exist load from web and save to cache
  fname = TDir "iascache/" widu
  if( checkexists( fname ) ) {
    page = readfile( fname )
  }
  else {
    page = http2var2("http://archive.org/details/" widu)
    print page > fname
    close( fname )
  }

  if( match(page, "stated date is [0-9]{4}", a) ) {     # If it has a "stated date is XXXX" on the main work page, use that.
    if(wid ~ debugid && debug)
      print " Step 0: " a[0]
    return strip( substr(a[0], length(a[0]) - 4, length(a[0]) ) )
  }

  if(wid ~ debugid && debug) {
    print page > debugo
    close(debugo)
  }

  # Otherwise, get the meta.xml file and see if it has a <date></date> pair

  # Read from local cache, if not exist load from web and save to cache
  fname = TDir "iascache/" widu "_meta.xml"
  if( checkexists( fname ) ) {
    metaxml = readfile( fname )
  }
  else {
    metaxml = http2var2("http://archive.org/download/" widu "/" widu "_meta.xml")
    print metaxml > fname
    close( fname )
  }

  match(metaxml, /[<]collection[>][^<]+[<][/]collection[>]/, e)
  if(tolower(e[0]) ~ /citebank/) {      # Special case for citebank
    if( match(metaxml, /[<]volume[>][^<]+[<][/]volume[>]/, f) ) {
      if(wid ~ debugid && debug)
        print " Step 2: " f[0]
      if( match(f[0], /[0-9]{4}/, d) ) {
        return strip(d[0])
      }
    }
    else {
      if( match(metaxml, /[<]year[>][^<]+[<][/]year[>]/, c) ) {
        if(wid ~ debugid && debug)
          print " Step 2: " c[0]
        if( match(c[0], /[0-9]{4}/, d) ) {
          return strip(d[0])
        }
      }
    }
  }

  if( match(metaxml, /[<]date[>][^<]+[<][/]date[>]/, c) ) {     # Check for <date>
    if(wid ~ debugid && debug)
      print " Step 2: " c[0]
    if( match(c[0], /[0-9]{4}/, d) ) {
      return strip(d[0])
    }
  }

  # If still nothing, check for <year>
  if( match(metaxml, /[<]year[>][^<]+[<][/]year[>]/, c) ) {
    if(wid ~ debugid && debug)
      print " Step 2: " c[0]
    if( match(c[0], /[0-9]{4}/, d) ) {
      return strip(d[0])
    }
  }

  # If still nothing, check if a date is in publisher field eg. see imperialgazette00meyegoog
  if( match(metaxml, /[<]publisher[>][^<]+[<][/]publisher[>]/, c) ) {
    if(wid ~ debugid && debug)
      print " Step 2: " c[0]
    if( match(c[0], /[0-9]{4}/, d) ) {
      if(strip(d[0]) > 1200 && strip(d[0]) < MaxYear ) {
        return strip(d[0])
      }
    }
  }

  if(wid ~ debugid && debug)
    print " Step 3: none found"

  return ""

}

#
# Build a passable XML file due to bug in Internet Archive (in some cases) triggered by unknown reasons.
# 1. create a fake XML header
# 2.
#    download list of identifier's in CSV format (this works for some reason but XML doesn't)
# 3. for each identifier, request *individual* XML (one for each work) and extract the portion between
#    (NOTE(review): the tag names that ended this sentence are missing; presumably the
#    <doc>...</doc> element - confirm)
# 4. build a complete XML from the parts
#
# Parameters mirror the pieces search_ia() uses to build its URL: head + entity + tail,
# plus rows, sort and pagenum. Returns the assembled XML text.
#
# NOTE(review): nearly every string literal that should hold XML markup is empty in this
# copy (the header lines, the per-work wrapper lines, the closing lines, the entity names
# in the gsub() line and the split() separator). As written the function cannot produce
# the numFound="..." attribute that the final gsub() rewrites. It looks like the markup was
# stripped from the file - restore it from a known-good copy before relying on this path.
#
# NOTE(review): c, url and qin are not in the local-variable list, so they are assigned
# as globals here.
#
function build_xml(head,tail,entity,rows,sort,pagenum,    csv,out,subxml,sc,doc,a,i,k) {

  qin = urlendecode(entity,"decode")

  # XML header
  out = "" "\n"
  out = out "" "\n"
  out = out "" "\n"
  out = out "0" "\n"
  out = out "89" "\n"
  out = out "" "\n"
  out = out "" "\n"
  out = out "xml" "\n"
  out = out "" rows "" "\n"
  out = out "" qin "" "\n"
  out = out "collection,creator,date,description,downloads,identifier,mediatype,publisher,title,year" "\n"
  out = out "0" "\n"
  out = out "" "\n"
  out = out "" "\n"
  out = out "" "\n"

  # Get CSV version (only content type that works)
  tail = "&fl[]=identifier&rows=" rows "&output=csv&callback=callback&save=yes&page=" pagenum
  url = head entity tail
  csv = http2var2(url)

  # From here on "tail" requests a single row of XML; it is reused for every identifier below.
  tail = "&fl[]=date&fl[]=publisher&fl[]=collection&fl[]=creator&fl[]=description&fl[]=downloads&fl[]=identifier&fl[]=mediatype&fl[]=subject&fl[]=title&sort[]=" sort "&sort[]=&sort[]=&rows=1&output=xml&callback=callback&save=yes&page=" pagenum
  c = split(csv, a, "\n")
  while(i++ < c) {
    if(i == 1) continue                 # Skip CSV header
    gsub(/"/,"",a[i])                   # Remove quotes around CSV field
    entity = "identifier%3A%22" a[i] "%22"
    url = head entity tail
    subxml = http2var2(url)
    # NOTE(review): as written these four substitutions replace each character with itself.
    gsub(/</,"<",subxml);gsub(/>/,">",subxml);gsub(/"/,"\"",subxml);gsub(/&/,"\\&",subxml)
    sc = split(subxml, doc, "(|)")
    if(length(doc[2]) > 10) {           # only keep a work when its extracted body is longer than 10 characters
      out = out "" "\n"
      out = out doc[2]
      out = out "" "\n"
    }
  }

  out = out "" "\n"
  out = out "" "\n"

  # c counts the CSV lines including the header line, hence c - 1.
  gsub(/numFound="15"/,"numFound=\"" c - 1 "\"",out)

  return out

}

#
# Search Wikipedia article for book title. Return true if match found.
# False positives may arise here if the book title is short or simple phrase
#
# Returns 1 when the reduced title (see extracttitle) fuzzy-matches the global WPArticle,
# otherwise 0.
#
function searchwikipedia(booktitle) {

  if( agrep( WPArticle, extracttitle(booktitle), ".25") )
    return 1
  return 0

}

#
# Strip a book title (from IA) down to its bare essential (rm subtitle etc) so that it can be searched for in the WP article.
#
function extracttitle(tit,      c,l,f,h,k,o,p) {

  delete p

  # Convert any XML codes, or ";"
  # NOTE(review): the entity names are reconstructed (the patterns replaced characters with
  # themselves in the reviewed copy) - confirm. They must run before ";" is turned into ":".
  gsub(/&gt;/,">",tit)
  gsub(/&quot;/,"\"",tit)
  gsub(/&amp;/,"\\&",tit)
  gsub(";",":",tit)

  # Rm all the words which follow one of these characters
  split(tit, h, "(—| - |[(]|:)")
  tit = h[1]

  # Rm special cases
  gsub(", in [Ff]our [Pp]arts.*","",tit)
  gsub("in [Ff]our [Pp]arts.*","",tit)
  gsub(", in [Tt]hree [Pp]arts.*","",tit)
  gsub("in [Tt]hree [Pp]arts.*","",tit)
  gsub(", in [Tt]wo [Pp]arts.*","",tit)
  gsub("in [Tt]wo [Pp]arts.*","",tit)
  gsub(/[Mm]lle[.]/, "Mademoiselle",tit)

  # Rm words following "." except in certain cases
  l = f = k = 0
  k = split(tit,o," ")
  while(l < k) {
    l++
    if(o[l] ~ /[.]/) {
      # Bypass these allowed words
      if(o[l] ~ /Mrs[.]|Mr[.]|[A-Z][.]|Inc[.]|Dr[.]|Capt[.]|doma[.]|St[.]|No[.]/) {
        f++
        p[f] = o[l]
      }
      else {
        # Keep first occurance of non-allowed "word." and drop the remaining words
        f++
        p[f] = o[l]
        # Strings are indexed from 1; starting at 0 makes gawk return one character fewer.
        p[f] = substr(p[f],1,length(p[f]) - 1)          # Rm trailing "."
        break
      }
    }
    else {
      f++
      p[f] = o[l]
    }
  }
  tit = join(p, 1, length(p), " ")

  # Rm words following "," except if first word (eg. "Sidonia, the Sorceress")
  f = l = k = 0
  delete p
  k = split(tit,o," ")
  while(l < k) {
    l++
    if(o[l] ~ /[,]/) {
      # Keep if first word
      if(f == 0) {
        f++
        p[f] = o[l]
      }
      else {
        f++
        p[f] = o[l]
        p[f] = substr(p[f],1,length(p[f]) - 1)          # Rm trailing ","
        break
      }
    }
    else {
      f++
      p[f] = o[l]
    }
  }
  tit = join(p, 1, length(p), " ")

  # Rm last word if "by"
  # Anchored so that only the word "by"/"By" itself qualifies, not "Derby" or "Baby".
  delete p
  f = 0
  c = split(tit, o, " ")
  if(strip(o[c]) ~ "^[Bb]y$") {
    while(f < c - 1) {
      f++
      p[f] = o[f]
    }
    tit = join(p, 1, length(p), " ")
  }

  # Rm second to last word if "by"
  delete p
  f = 0
  c = split(tit, o, " ")
  if(strip(o[c - 1]) ~ "^[Bb]y$") {
    while(f < c - 2) {
      f++
      p[f] = o[f]
    }
    tit = join(p, 1, length(p), " ")
  }

  return strip(tit)

}

#
# Build a RE with a date range
#
# Returns the seven years givendate-3 .. givendate+3 joined with "|", for use inside a
# regex group. With no date it returns a string that is not expected to match anything.
#
function daterange(givendate,   re) {

  if(givendate)
    re = givendate -3 "|" givendate -2 "|" givendate -1 "|" givendate "|" givendate + 1 "|" givendate + 2 "|" givendate + 3
  else
    re = "a1b2c3d4e5f6g7h8i9j0"         # nonsense string if no date ie. no match will be made

  return re
}

# _________________________ utilities _______________________________________________________
# See library.awk for others

#
# http2var - replicate "wget -q -O- http://..." in pure gawk
# Return the HTML page as a string.
#
# NOTE(review): url and Agent are placed inside double quotes on a shell command line; a
# value containing ", $ or ` would be interpreted by the shell.
#
function http2var2(url) {

  return clean( sys2var( Exe["wget"] " --user-agent=\"" Agent "\" -q -O- \"" url "\"") )

}

#
# Remove certain trailing characters from a string
#
# Removes a trailing "()" and then a trailing "," or ";". Only the very end of the string
# is examined, so punctuation inside the string is never touched.
#
function rmtrail(str) {

  str = strip(str)

  sub(/[(][)]$/, "", str)               # remove trailing ()
  sub(/[,;]$/, "", str)                 # remove trailing , or ;

  return strip(str)

}

#----------------------------------------------------
# Approximate (fuzzy) matching using agrep
#
# source = source text
# search = text to search for in source
# percent = maximum error rate percentage of search.
#           ie.
#           if source is 12 characters and max error rate is 25%, set to ".25"
#           and it will return a match if up to 3 characters are wrong.
# debug = if "1", print debug statement.
# stype = search string is "regex" or "plain" text. Or "exact" for exact match (case-insensitive)
#         An empty stype means "plain"; any other value makes the function return 0.
# rlength = optional. Length of string without regex characters. Use if using "regex".
#
# Error rate is hard coded: max out at "6" on the upper and "1" on the lower.
# Agrep set to case-insensitive
#
# Return 0 if no match, otherwise number of matches
#
#----------------------------------------------------
function agrep(source, search, percent, debug, stype, rlength,    slength,errorlimit,results,s,command) {

  if(stype == "")
    stype = "plain"

  slength = length(search)
  if(rlength == "" || rlength == 0)
    rlength = slength
  if(stype == "regex")
    slength = rlength                   # regex syntax characters should not count towards the error budget

  # Limit # of errors to 25% of length of str, or no more than 6, whichever is less
  if(slength > 24)
    errorlimit = 6
  else
    errorlimit = int(slength * percent)

  # Floor: at least 1 error for strings shorter than 6 characters, otherwise at least 2.
  if(errorlimit < 2) {
    if(slength < 6)
      errorlimit = 1
    else
      errorlimit = 2
  }

  gsub("\"","\\\"",search)              # Escape any " marks
  if( substr(search,slength,1) ~ /\\/ ) {       # If last character is " ..
    s = substr(search, 1, slength - 1)
    search = s
  }

  # The mode is carried in stype. (The "exact" branch used to test an unrelated, never-set
  # variable, so "exact" searches always fell through to "return 0".)
  if(stype == "regex")
    command = Exe["agrep"] " -i -c -" errorlimit " -- \"" agrepstrip( strip(search) ) "\""
  else if(stype == "exact")
    command = Exe["agrep"] " -i -k -c -0 -- \"" agrepstrip( strip(search) ) "\""
  else if(stype == "plain")
    command = Exe["agrep"] " -i -k -c -" errorlimit " -- \"" agrepstrip( strip(search) ) "\""
  else
    return 0

  if(debug)
    print "Agrep command = " command

  # Two-way pipe: send the source text to agrep, close the write side so agrep sees
  # end-of-input, then read back the match count printed by -c.
  print agrepstrip(source) |& command
  close(command, "to")
  command |& getline results
  close(command)

  if(results > 0)
    return results
  else
    return 0

}

#
# Length of an array.
# Portable function for older versions of gawk
#
# Counts the consecutive integer keys 1, 2, 3, ... and stops at the first one that is missing.
#
function len2(array,    i) {

  for (i = 1; (i in array); i++)
    ;
  return i - 1
}

#
# strip HTML (method has general limitations but OK for this app)
#
# Returns s with every <...> run removed.
#
function striphtml(s) {

  return gensub(/<[^>][^>]*>/, "", "g", s)
}

#
# Remove problem shell characters when running agrep
#
# Returns str with every backquote deleted.
#
function agrepstrip(str) {

  gsub(/[`]/, "", str)
  return str
}

#
# Count number of occurances of word in str .. presumes space separated
#
function occurances(str, word,    count,c,a,i) {

  count = 0
  c = split(str, a, " ")
  for (i = 1; i <= c; i++) {
    if (a[i] == word)
      count++
  }
  return count
}

#
# Escape regex symbols
#
# Each regex metacharacter is wrapped in a bracket expression, eg. "(" becomes "[(]".
#
function regesc2(str,   safe) {

  safe = gensub(/[][^$*?+{}\\()|]/, "[&]", "g", str)
  gsub("[\\^]","\\^",safe)                      # replace "[^]" with "[\^]"
  safe = gensub("[.]", "[&]{0,1}", "g", safe)   # replace "." with "[.]{0,1}"
  return safe
}