#!/bin/awk -f

@load "readfile"
@load "filefuncs"

BEGIN {

# ################################################################################################
#
# Internet Archive Search (ias). Adapted from iacs (Internet Archive Classic Search)
# Stephen Balbach October 2015
#
# Pass variables:
#
#   -v id="/home/stbalbach/wi-awb/temp/wi-awb-1026211441/"
#   -v id="/home/stbalbach/wi-awb/temp/wi-awb-1031200119/"
#
#
# The purpose of this program is to search each book's metadata from a particular IA search
# and determine if it belongs to the intended author. It will search the first 50 results
# or whatever "Rows" is set to. It returns the number of books that it thinks match that author.
#
#
# ____________________________________________________________________________________________
#
# Configuration variables:
#
# 1. Program filename - the name of the script.

        MyProg = "ias.awk"

# 2. Run as a "standalone" web server on a PC; or as a "cgi" script on an existing webserver.
#       RunType = "standalone"

# 3. If RunType = "standalone" set the following two variables.
#
# 3.1. Port to run the server on. Can be anything. May need to modify your firewall to allow access.
#      Ignore this if RunType is "cgi".

        MyPort = "8080"

# 3.2 Hostname - either a working domain name, a static IP, or if running
#     on the same computer you're browsing from in "standalone" mode set to "localhost".
#     Ignore this if RunType is "cgi".

        MyHost = "localhost"

# 4. If RunType = "cgi" set the following two variables:
#
# 4.1 URL prefix. Don't include a trailing slash. Ignore if RunType is "standalone"
#       URLPrefix = "http://mydomain.net/cgi-bin"

# 4.2 Do you have wget on your system? Set to "yes" or "no". If you don't know, set to "?".
#     Setting this will speed up the script so it doesn't have to check if wget exists with each page
#     load when in "cgi" mode.
#     If running in "standalone" mode on Windows set to "no" or "?"
#     Wget is an alternative because it handles broken network connections more gracefully than awk
#     but otherwise is not required, though recommended for cgi installs.
#     NOTE(review): http2var() in this file only downloads when WTA == "wget".

        WTA = "wget"

# 5. Number of results per page (50 was the IA default). Can be anything based on
#    your computer speed and memory etc..

        Rows = 200

# 6. Page index block size. This is the number of pages displayed in the index bar before
#    the "Next" button.

        PBSize = 20

#
# 7. URL Agent string - this is what Internet Archive sees in their logs. Recommend
#    setting the name of the program and your contact information, such as an email
#    or your Internet Archive userid.

        Agent = "Internet Archive Classic Search (iacs2015@nym.hush.com)"

# Working directory (passed with -v id=...; expected to end in "/") and path to the agrep binary.

        TDir = id
        AGREP = "/home/stbalbach/tre-0.8.0/src/agrep"

#
# Maximum year a book can be published. ie. if a book publication year is beyond this then reject it as garbage data
# scrapedate() compares against MaxYear, so it must be assigned here for that check to be meaningful.
#

        MaxYear = 2030

#
# 8. To run in "standalone" mode:
#      Start the server: awk -f iacs.awk
#      To access: http://<MyHost>:<MyPort>/
#      eg. http://localhost:8080/
#
#
# ##################################################################################################

        delete G
        main()

}

#
# main - score every candidate author-name search and record the best one.
#
#   Reads   TDir "article.txt"   -> global WPArticle (Wikipedia article text used by searchwikipedia())
#           TDir "ia-results"    -> lines of "type|hits|name"; blank lines and 0-hit lines are ignored
#           TDir "tmpO@<name>@<type>@<N>" (N = 1..20, first that exists) -> holds the IA search URL
#   Writes  TDir "ias.out"       -> the key of the best scoring search (only when one was chosen)
#
#   Takes no arguments; everything in the parameter list is a local variable.
#
function main(    s,c,a,b,k,kname,mname,fulln,ktype,khits,m,n,o,d,count,debug,debugid,i,re,result,outfile) {

        debug = 0                                                  # 1=on, 0=off
        debugid = "tmpO@Étienne_Geoffroy_Saint-Hilaire@t@2"       # Specified in this format

        WPArticle = readfile(TDir "article.txt")

        # Index the result list as k[name][type][hits]
        s = readfile(TDir "ia-results")
        c = split(s, a, "\n")
        while (i++ < c) {
            if(a[i] != "") {                           # remove blank lines
                split(a[i], b, "|")
                if(b[2] != 0)                          # remove 0 hit results
                    k[b[3]][b[1]][b[2]] = 1
            }
        }

        for(kname in k)
            m[kname] = kname

        for(mname in m) {
            #print "mname = " mname

            # Collect the temp-file prefixes for this name, keyed by search-type priority
            # (1 = "tx", 2 = "t", 3 = "w", 4 = "none").
            for(kname in k) {
                #print "kname = " kname
                if( kname ~ "^" mname "$" ) {          # NOTE(review): the name is used as a regular expression here
                    #print "MATCHED"
                    for(ktype in k[kname]) {
                        #print "ktype = " ktype
                        for(khits in k[kname][ktype]) {
                            #print "khits = " khits
                            if(ktype == "none")
                                n[4] = "tmpO@" gensub(" ","_","g",kname) "@" ktype "@"
                            if(ktype == "w")
                                n[3] = "tmpO@" gensub(" ","_","g",kname) "@" ktype "@"
                            if(ktype == "t")
                                n[2] = "tmpO@" gensub(" ","_","g",kname) "@" ktype "@"
                            if(ktype == "tx")
                                n[1] = "tmpO@" gensub(" ","_","g",kname) "@" ktype "@"
                            fulln = kname
                        }
                    }
                }
            }

            PROCINFO["sorted_in"] = "@ind_num_asc"
            for(o in n) {
                # print "n[o] = " n[o]

                # Find the first numbered temp file (suffix 1..20) that exists
                c = 0
                while(c++ < 20) {
                    if(exists(TDir n[o] c) == 1)
                        break
                }
                s = readfile(TDir n[o] c)

                print "________________________________________________________________________________________________________________________________"
                print n[o] c
                print "________________________________________________________________________________________________________________________________"

                # Keep only the query portion of the search URL; a[1] is passed on as the entity
                split(gensub("[[]http://archive.org/search.php[?]query=","","g",s),a," ")
                d = split(mname,b," ")                 # b[1] = first word of the name, b[d] = last word

                count = 0
                if(b[1] !~ /^Sir$/) {                  # Skip names that begin with "Sir Whatever" as they produce odd results
                    if(debug) {
                        re = n[o] c
                        if(debugid ~ re)
                            count = search_ia(a[1],1,"",b[1],b[d],fulln)
                    }
                    else
                        count = search_ia(a[1],1,"",b[1],b[d],fulln)
                }
                else {
                    count--
                }
                print " |1-----|"
                print count
                result[n[o] c] = count
            }
            delete n
        }

        print " |2-----|"

        # Pick the highest count; on a tie the longer key wins.
        PROCINFO["sorted_in"] = "@val_num_desc"
        delete b
        b[1]["name"] = ""
        b[1]["count"] = 0
        for(o in result) {
            print o " = " result[o]
            if(result[o] > b[1]["count"]) {
                b[1]["count"] = result[o]
                b[1]["name"] = o
            }
            if(result[o] == b[1]["count"]) {
                if(length(o) > length(b[1]["name"]) ) {
                    b[1]["count"] = result[o]
                    b[1]["name"] = o
                }
            }
        }

        if(length(b[1]["name"]) > 0) {
            # close() must be given the same string that was used in the redirection,
            # so build the path once and use it for both.
            outfile = TDir "ias.out"
            print b[1]["name"] > outfile
            close(outfile)
        }
}

#
# Search Internet Archive and format results.
#
# search_ia - download one page of Internet Archive advancedsearch results and score them.
#
#   entity   query string appended to the advancedsearch URL as-is
#   pagenum  page number to request
#   sort     value placed in the first sort[] field
#   firstn   first name, lastn last name (handed to init_searchtokens_full)
#   fulln    full name (handed to find_searchtokens_full)
#   The remaining parameters are local variables.
#
#   Returns the sum of find_searchtokens_full() over the records, -1 when nothing could be
#   downloaded, or -1001 when the running count drops below the abort threshold.
#
#   NOTE(review): several patterns below ("(|)", "", //, "(") look as if they lost
#   angle-bracket markup somewhere before this copy of the file - confirm against the upstream source.
#   NOTE(review): ad, ac and base are not in the local list, so they are globals.
#
function search_ia(entity, pagenum, sort, firstn, lastn, fulln, head,tail,url,xml,c,numfound,doc,page,arr,arrg,arrgh,qin,G,i,count,tokenhits) {

        head = "http://archive.org/advancedsearch.php?q="
        tail = "&fl[]=date&fl[]=publisher&fl[]=collection&fl[]=creator&fl[]=description&fl[]=downloads&fl[]=identifier&fl[]=mediatype&fl[]=subject&fl[]=title&sort[]=" sort "&sort[]=&sort[]=&rows=" Rows "&output=xml&callback=callback&save=yes&page=" pagenum
        url = head entity tail

        # Download XML from Internet Archive
        xml = http2var(url)
        if ( xml == "" || length(xml) == 0) {
            print "Error in function search_ia: Unable to retrieve data from Internet Archive." > "/dev/stderr"
            return -1
        }
        if(length(xml) < 100) {                        # Bug in Internet Archive.
            print "Warning: XML bug. Using build_xml()"
            xml = build_xml(head,tail,entity,Rows,sort,pagenum)
        }

        # NOTE(review): as written each of these replaces a character with itself;
        # presumably they once decoded XML entities - confirm.
        gsub(/</,"<",xml);gsub(/>/,">",xml);gsub(/"/,"\"",xml);gsub(/&/,"\\&",xml)

        # Split the response into per-record chunks (the loop below reads the even-numbered ones)
        c = split(xml, doc, "(|)")

        # numFound="432"
        # The parsed value is stored in the global Numfound, not in the local numfound.
        match(xml, "numFound=\"[0-9]+\"", arr)
        split(arr[0], arrg, "\"")
        Numfound = arrg[2]

        # name="qin">text<  (search string as reported by IA)
        split(doc[1], arr, "(name=\"qin\">)")
        split(arr[2], arrg, "")
        qin = arrg[1]

        init_searchtokens_full(qin,firstn,lastn)

        i = 0
        while(i < (Rows * 2) ) {
            i = i + 2
            delete G

            # identifier (name="identifier">presidentgarfiel00hinsuoft<)
            match(doc[i], "name=\"identifier\">[^<]+<", arr)
            split(arr[0],arrg,"(<|>)")
            G["identifier"] = arrg[2]
            if ( arrg[2] == "" )
                continue                               # Skip if no id

            # creator and subject: scan the record line by line, remove the wrapping markup
            # and store the stripped remainder in G["creator"] / G["subject"]
            ad = 0
            ac = split(doc[i], arr, "\n")              # possible point of failure if XML format ever stopped having a \n after each line
            while(ad++ < ac) {
                if(arr[ad] ~ // ) {
                    gsub(//,"",arr[ad])
                    gsub(/<\/arr>/,"",arr[ad])
                    G["creator"] = strip(arr[ad])
                    continue
                }
                if(arr[ad] ~ // ) {
                    gsub(//,"",arr[ad])
                    gsub(/<\/str>/,"",arr[ad])
                    G["creator"] = strip(arr[ad])
                    continue
                }
                if(arr[ad] ~ // ) {
                    gsub(//,"",arr[ad])
                    gsub(/<\/arr>/,"",arr[ad])
                    G["subject"] = strip(arr[ad])
                    continue
                }
                if(arr[ad] ~ // ) {
                    gsub(//,"",arr[ad])
                    gsub(/<\/str>/,"",arr[ad])
                    G["subject"] = strip(arr[ad])
                    continue
                }
            }

            # description (name="description">text<)
            # NOTE(review): arrg is not refreshed between this split and the substr below,
            # so it still holds the identifier split - a statement may be missing here; confirm.
            split(doc[i], arr, "(")
            G["description"] = striphtml(substr(arrg[1], 2))

            # date (name="date">1881-01-01T00:00:00Z) - keep the part before the first "-"
            match(doc[i], "name=\"date\">[^<]+<", arr)
            split(arr[0],arrg,"(<|>)")
            split(arrg[2],arrgh,"-")
            G["date"] = strip(arrgh[1])
            if(G["date"] == "")
                G["date"] = scrapedate(G["identifier"])

            # mediatype (name="mediatype">texts<)
            match(doc[i], "name=\"mediatype\">[^<]+<", arr)
            split(arr[0],arrg,"(<|>)")
            G["mediatype"] = arrg[2]

            # title (name="title">Title<)
            match(doc[i], "name=\"title\">[^<]+<", arr)
            split(arr[0],arrg,"(<|>)")
            G["title"] = arrg[2]

            count = count + find_searchtokens_full(G, fulln)

            # Abort early - no chance
            # NOTE(review): numfound is never assigned in this function (see Numfound above),
            # so this comparison uses an unset value - looks unintended; confirm.
            if(numfound <= Rows)
                base = numfound
            else
                base = Rows
            if(count < ( ( (base / 2) * -1) -1) )
                return -1001
        }

        return count
}

#
# find_searchtokens_full - score one work record against the global TokensFull table.
#
#   G      array with the record fields (identifier, creator, subject, description, date,
#          mediatype, title); every field is rewritten in place with "-" turned into " "
#   fulln  full author name (only its length is used, in the comma-split heuristic)
#   The remaining parameters are local variables.
#
#   Returns 1 when the final count is above zero, otherwise -1; returns 0 as soon as the
#   loop reaches a mediatype of "software" or an identifier starting with "arxiv".
#
#   NOTE(review): debug, debugid and caindex are not in the local list, so they are globals.
#
function find_searchtokens_full(G, fulln, token,subentity,count,cc,ci,ca,da,dc,di,dm,re,stopflag) {

        # Enable debug to examine how it scores individual work IDs. Some output saved to file "test" (overwritten with each run)
        debug = 1                                      # 1=on, 0=off
        debugid = "souvenirsdelama04courgoog"

        if(G["identifier"] ~ debugid && debug)
            print "Starting" > "test"

        dc = split(TokensFull["_date"], da, " ")
        count = 0

        if( searchwikipedia(G["title"]) ) {            # If the book title is in the Wikipedia article..
            count++
            print " " G["identifier"] " | title | (wikipedia article) | = " G["title"]
            if(G["identifier"] ~ debugid && debug)
                print "+1 in special 1-2 for " subentity >> "test"
        }

        for(subentity in G) {

            # Skip searching these fields
            # NOTE(review): this compares subentity with the result of a regex match on $0,
            # not with the string "identifier" - probably not what was meant; confirm.
            if(subentity == /identifier/ )
                continue

            if(subentity == "mediatype") {             # Skip mediatype software
                if(G[subentity] == "software") {
                    return 0
                }
            }
            if(subentity == "identifier") {            # Skip arxiv records
                if(G[subentity] ~ /^arxiv/) {
                    return 0
                }
            }

            # Remove all "-" (same thing done in init_searchtokens_full())
            G[subentity] = strip(gensub("-"," ","g",G[subentity]))

            stopflag = 0                               # limits the debug dump of this field to once per field

            for(token in TokensFull) {

                if(TokensFull[token] == "")
                    continue

                # These helper tokens are never matched on their own
                if(token ~ (/^_lastname$|^_firstname$|^_firstname_special2$|^_lastname_special1$|^_lastname_special2$/)) {
                    if(G["identifier"] ~ debugid && debug)
                        print "Skipping " token " in " subentity >> "test"
                    continue
                }

                # Date range plus last name in a field other than creator/subject
                if(token == "_date" && subentity !~ /^creator$|^subject/) {   # See below for creator/subject + date check
                    re = daterange(TokensFull["_birth"]) ".+{1,3}" TokensFull["_death"]
                    if( match(G[subentity], re) && match(tolower(G[subentity]), tolower(TokensFull["_lastname"]) ) ) {
                        count++
                        if(G["identifier"] ~ debugid && debug)
                            print "+1 in _date: G[" subentity "]=|" G[subentity] "| TokensFull[" token "]=|" TokensFull[token] "| lastname=|" TokensFull["_lastname"] "|" >> "test"
                        continue
                    }
                }

                # Try each first-name/last-name combination of the "special" variants
                if(token == "_firstname_special1") {
                    if( match(tolower(G[subentity]), tolower(TokensFull["_firstname_special1"])) && match(tolower(G[subentity]), tolower(TokensFull["_lastname_special1"])) ) {
                        count++
                        if(G["identifier"] ~ debugid && debug)
                            print "+1 in special 1-1 for " subentity >> "test"
                        continue
                    }
                    if( match(tolower(G[subentity]), tolower(TokensFull["_firstname_special1"])) && match(tolower(G[subentity]), tolower(TokensFull["_lastname_special2"])) ) {
                        count++
                        if(G["identifier"] ~ debugid && debug)
                            print "+1 in special 1-2 for " subentity >> "test"
                        continue
                    }
                    if( match(tolower(G[subentity]), tolower(TokensFull["_firstname_special2"])) && match(tolower(G[subentity]), tolower(TokensFull["_lastname_special1"])) ) {
                        count++
                        if(G["identifier"] ~ debugid && debug)
                            print "+1 in special 2-1 for " subentity >> "test"
                        continue
                    }
                    if( match(tolower(G[subentity]), tolower(TokensFull["_firstname_special2"])) && match(tolower(G[subentity]), tolower(TokensFull["_lastname_special2"])) ) {
                        count++
                        if(G["identifier"] ~ debugid && debug)
                            print "+1 in special 2-2 for " subentity >> "test"
                        continue
                    }
                }

                # If creator or subject string contains a date, and it doesn't match one of the birth-death dates in TokensFull, then don't make a match.
                if(subentity ~ /^creator$|^subject/ && token != "_birth" && token != "_death") {

                    if(G["identifier"] ~ debugid && debug && stopflag == 0) {
                        print " " subentity ": " G[subentity]
                        stopflag = 1
                    }

                    # Break the field into individual names in ca[1..cc]
                    cc = split(G[subentity], ca, /|<\/str>/)   # when multi-names are separated by markup
                    G[subentity] ~ // ? caindex = 2 : caindex = 1
                    if(ca[caindex] ~ /;/) {            # when multi-names are separated by ; ..this may break in certain cases
                        cc = split(ca[caindex], ca, /;/)
                    }
                    else if(ca[caindex] ~ /[ ]and[ ]/) {   # when multi-names are separated by / and / eg. archive.org/details/UsgsBulletin507MiningDistrictsOfTheWesternUnitedStates
                        cc = split(ca[caindex], ca, /[ ]and[ ]/)
                    }
                    else if(G["description"] ~ /by user tpb/ && ca[caindex] !~ /[0-9]{4}/ && G[subentity] !~ // && length(ca[caindex]) > length(fulln) + 15 ) {   # when multi-names sep by , ("uploaded by user tpb")
                        cc = split(ca[caindex], ca, /,/)
                    }

                    ci = 0
                    while(ci++ < cc) {
                        if(ca[ci] == "")
                            continue

                        # Normalize the name: drop brackets, catalog notes and birth markers
                        ca[ci] = strip(ca[ci])
                        gsub(/[[]/,"",ca[ci])
                        gsub(/[]]/,"",ca[ci])
                        re = "[.|,|;|-]{0,1}[ ]{0,1}from old catalog.*$"   # remove trailing "[from old catalog]" and variations
                        if(ca[ci] ~ re)
                            ca[ci] = rmtrail( gensub(re, "", "g", ca[ci]) )
                        re = "[Bb][.][ ]{0,1}" daterange(TokensFull["_birth"]) "$"   # remove trailing "b. 1860"
                        if(ca[ci] ~ re)
                            ca[ci] = rmtrail( gensub(re, "", "g", ca[ci]) )

                        if(token != "_date") {         # leave date in if checking for date otherwise remove it
                            ca[ci] = rmtrail( gensub(TokensFull["_date"], "", "g", ca[ci]) )   # remove date
                            ca[ci] = rmtrail( gensub(/[(][^\)]*[)]$/, "", "g", ca[ci]) )   # remove final parenthesis content eg. Loudon, W. J. (William James)
                            re = daterange(TokensFull["_birth"]) "[-]{0,1}$"   # remove trailing "1860-" ie. when no death date is given
                            if(ca[ci] ~ re)
                                ca[ci] = rmtrail( gensub(re, "", "g", ca[ci]) )
                            ca[ci] = rmtrail( gensub("Sir$", "", "g", ca[ci]) )   # remove trailing "Sir"
                            ca[ci] = strip( gensub("^Sir ", "", "g", ca[ci]) )    # remove leading "Sir "
                        }

                        # Build the pattern: whole-name match for name tokens, birth..death range for _date
                        if(token != "_date")
                            re = "^" gensub(/[.]/, "[.]{0,1}", "g", TokensFull[token]) "$"   # eg. "^William T[.] Volk$"
                        else
                            re = daterange(TokensFull["_birth"]) ".+{1,3}" TokensFull["_death"]   # eg, "1860.+{1}1910" which is like 1860?1910 - will match "#-#", "# - #", "# #", etc..

                        gsub("-","",re)                # Are these two lines needed?
                        re = gensub("-","","g",re)

                        if(G["identifier"] ~ debugid && debug)
                            print " Step 1: " ca[ci] " | " re

                        if(token != "_date" && match(tolower(ca[ci]), tolower(re)))
                            dm = 1
                        if(token == "_date" && match(tolower(ca[ci]), tolower(re)) && match(tolower(ca[ci]), tolower(TokensFull["_lastname"])) )   # match date + lastname
                            dm = 1

                        if(dm == 1) {
                            if(G["identifier"] ~ debugid && debug)
                                print " Matched:"
                            print " " G["identifier"] " | " subentity " |" TokensFull[token] "| = " ca[ci]
                            count++
                            dm = 0
                            if(G["identifier"] ~ debugid && debug)
                                print " +1 for " token " in " subentity >> "test"
                        }
                    }
                    continue
                }

                # Any other field: plain case-insensitive search for the token text
                re = gensub(/[.]/,"[.]","g",TokensFull[token])
                if(match(tolower(G[subentity]), tolower(re)) && token != "_birth" && token != "_death" ) {
                    print " " G["identifier"] " | " subentity " |" TokensFull[token] "| = " G[subentity]
                    count++
                    if(G["identifier"] ~ debugid && debug) {
                        print "+1 for " token " (" TokensFull[token] ") in " subentity ": " G[subentity] >> "test"
                    }
                }
            }
        }

        # One point off when the publication date is outside the author's plausible range
        if(datehit(G,TokensFull["_birth"]) == 0)
            count--

        if(G["identifier"] ~ debugid && debug)
            close("test")

        print G["identifier"] " | " count

        if(count > 0)
            return 1
        return -1
}

#
# Return 1 if book year is within the range author's year of birth + 20 -> year of death
#
#   G      record array; reads G["date"] and G["mediatype"]
#   birth  birth year used for the "old author" exemption
#
#   Also returns 1 when the mediatype does not match "texts" or when birth is below 1830;
#   returns 0 when the date is empty or outside the range. The range itself is read from
#   the global TokensFull["_birth"] / TokensFull["_death"]. Prints the date as a side effect.
#
function datehit(G,birth) {

        print "\nDATE = " G["date"]

        if(G["mediatype"] !~ "texts")                  # Don't penalize audio/video collections which don't have historic dates
            return 1
        if(birth < 1830 )                              # Don't penalize old authors since most of their books on IA will be later-period reprints
            return 1
        if(G["date"] == "")
            return 0
        if(G["date"] >= TokensFull["_birth"] + 20 && G["date"] <= TokensFull["_death"])
            return 1
        return 0
}

#
# init_searchtokens_full - rebuild the global TokensFull table from the query string.
#
#   qin     search string as reported by IA; it is split on double quotes
#   firstn  stored as TokensFull["_firstname"]
#   lastn   stored as TokensFull["_lastname"]
#   The remaining parameters are local variables.
#
#   A quoted piece shaped like NNNN-NNNN becomes _date/_birth/_death; pieces that split
#   into exactly one space-separated word are skipped; every other piece is stored under
#   its own text. Finally "-" is replaced by " " in every value and the table is printed.
#
#   NOTE(review): offset, i and head are not in the local list, so they are globals.
#
function init_searchtokens_full(qin, firstn, lastn ,a,c,d,e,f,h) {

        delete TokensFull

        TokensFull["_lastname"] = lastn
        TokensFull["_firstname"] = firstn

        # NOTE(review): the placement of "$" and the parentheses in this pattern looks odd - verify against upstream.
        if(match(qin,/$\(\"[^\"]+\" OR \"[^\"]+\") AND \(\"[^\"]+\" OR \"[^\"]+\"$\) OR $\"[^\"]+\" AND \"[^\"]+\"$/) ) {   # special case for extended ascii searches
            split(qin,f,"\"")
            TokensFull["_firstname_special1"] = f[2]
            TokensFull["_firstname_special2"] = f[4]
            TokensFull["_lastname_special1"] = f[6]
            TokensFull["_lastname_special2"] = f[8]
        }

        c = split(qin, a, "\"")
        offset = 0
        if(length(a[1]) > 0)
            offset = 1                                 # the query does not start with a quote
        i = 0 + offset
        while(i++ < c) {
            if(a[i] ~ /^[0-9]{4}-[0-9]{4}$/) {
                TokensFull["_date"] = a[i]
                i = i + offset
                split(TokensFull["_date"], h, "-")
                TokensFull["_birth"] = h[1]
                TokensFull["_death"] = h[2]
                continue
            }
            d = split(a[i],e," ")
            if(d == 1) {
                i = i + offset
                continue
            }
            TokensFull[a[i]] = a[i]
            i = i + offset
        }

        # Remove all "-" (same thing done in find_searchtokens_full())
        for(head in TokensFull)
            TokensFull[head] = strip(gensub("-"," ","g",TokensFull[head]))

        print qin
        print " |5-----|"
        for(head in TokensFull)
            print head " = " TokensFull[head]
        print " |6-----|"
}

#
# highlight search tokens
#
#   str  text to process; it is split on spaces
#   Returns the words joined back with single spaces. A word whose punctuation-trimmed,
#   lower-cased form equals a lower-cased key of the global Tokens array is replaced by
#   "" word "".
#   NOTE(review): the two empty strings presumably held highlight markup that was lost - confirm.
#
function hightokens(str ,c,arr,i,t,out,build,work) {

        c = split(str, arr, " ")
        while(i < c) {
            i++
            out = ""
            work = arr[i]
            gsub("([,]|\"|[.]$|\"[.]$|'[.]$|`[.]$|;|[:]|^'|'$|^`|`$|'s$|'s[.]$)","",work)   # <-- Customize search token exceptions
            for ( t in Tokens ) {
                if ( tolower(t) == tolower(work) ) {
                    out = "" arr[i] ""
                    break
                }
            }
            out == "" ? out = arr[i] : ""              # no token matched: keep the word unchanged
            i == 1 ? build = out : build = build " " out
        }
        return build
}

#
# init_searchtokens - rebuild the global Tokens array from an IA query string.
#
#   iaquery  query text; double quotes are removed before it is split into words
#   Words matching "(AND|OR|:)" are ignored; trailing "," and ")" and leading "(" are
#   trimmed from the rest, and each word is stored as Tokens[word] = word.
#
#   NOTE(review): "+" on its own is not a well-formed regular expression - presumably a
#   literal plus sign was meant; confirm how gawk treats it.
#
function init_searchtokens(iaquery ,arr, arrg, i, k, c) {

        delete Tokens

        gsub("\"","",iaquery)
        gsub("+"," ",iaquery)
        split(iaquery, arr, "+")
        k = join(arr, 1, len2(arr), " ")
        c = split(k, arrg, " ")

        while ( i++ < c) {
            if ( arrg[i] !~ "(AND|OR|:)" ) {           # <-- Customize tokens to ignore eg. logic statements, etc..
                gsub(/[,]+$/,"",arrg[i])
                gsub(/[)]+$/,"",arrg[i])
                gsub(/^[(]+/,"",arrg[i])
                Tokens[arrg[i]] = arrg[i]
            }
        }
}

#
# If date of work is empty, it may be due to bug at Internet Archive caused when date field contains non-ascii characters it returns empty date.
# See examples paramountmethodf00fode ("c1922") and greeceallies191400abbouoft ("[1922]")
# This function scrapes the main work page searching for "stated date is 1922" in copyright-notice, or failing that looking at the meta.xml file itself.
#
#   wid  Internet Archive work identifier
#   Returns a four-digit year, or "" when none could be found.
#   The remaining parameters are local variables.
#
function scrapedate(wid, page,a,b,c,d,e,metaxml,debug,debugid,f) {

        # Enable debug to examine how it handles individual work IDs.
        debug = 0                                      # 1=on, 0=off
        debugid = "visionbowditchhp00bowdrich"

        page = http2var("http://archive.org/details/" wid)

        if( match(page, "stated date is [0-9]{4}", a) ) {   # If it has a "stated date is XXXX" on the main work page, use that.
            if(wid ~ debugid && debug)
                print " Step 0: " a[0]
            return strip( substr(a[0], length(a[0]) - 4, length(a[0]) ) )
        }

        if(wid ~ debugid && debug) {
            print page > "testpage"
            close("testpage")
        }

        # Otherwise, get the meta.xml file and look for a date in it
        metaxml = http2var("http://archive.org/download/" wid "/" wid "_meta.xml")

        match(metaxml, /[<]collection[>][^<]+[<][/]collection[>]/, e)
        if(tolower(e[0]) ~ /citebank/) {               # Special case for citebank: try the volume element, else the year element
            if( match(metaxml, /[<]volume[>][^<]+[<][/]volume[>]/, f) ) {
                if(wid ~ debugid && debug)
                    print " Step 2: " f[0]
                if( match(f[0], /[0-9]{4}/, d) )
                    return strip(d[0])
            }
            else {
                if( match(metaxml, /[<]year[>][^<]+[<][/]year[>]/, c) ) {
                    if(wid ~ debugid && debug)
                        print " Step 2: " c[0]
                    if( match(c[0], /[0-9]{4}/, d) )
                        return strip(d[0])
                }
            }
        }

        # Check for a date element
        if( match(metaxml, /[<]date[>][^<]+[<][/]date[>]/, c) ) {
            if(wid ~ debugid && debug)
                print " Step 2: " c[0]
            if( match(c[0], /[0-9]{4}/, d) )
                return strip(d[0])
        }

        # If still nothing, check for a year element
        if( match(metaxml, /[<]year[>][^<]+[<][/]year[>]/, c) ) {
            if(wid ~ debugid && debug)
                print " Step 2: " c[0]
            if( match(c[0], /[0-9]{4}/, d) )
                return strip(d[0])
        }

        # If still nothing, check if a date is in publisher field eg. see imperialgazette00meyegoog
        # Only accepted when it lies between 1200 and the global MaxYear.
        if( match(metaxml, /[<]publisher[>][^<]+[<][/]publisher[>]/, c) ) {
            if(wid ~ debugid && debug)
                print " Step 2: " c[0]
            if( match(c[0], /[0-9]{4}/, d) ) {
                if(strip(d[0]) > 1200 && strip(d[0]) < MaxYear )
                    return strip(d[0])
            }
        }

        if(wid ~ debugid && debug)
            print " Step 3: none found"

        return ""
}

#
# Build a passable XML file due to bug in Internet Archive (in some cases) triggered by unknown reasons.
#  1. create a fake XML header
#  2. download list of identifier's in CSV format (this works for some reason but XML doesn't)
#  3. for each identifier, request *individual* XML (one for each work) and extract the record portion
#  4. build a complete XML from the parts
#
#   head, tail, entity, rows, sort, pagenum  same meaning as in search_ia (tail is rebuilt here)
#   Returns the assembled text. The remaining parameters are local variables.
#
#   NOTE(review): the literal strings below ("" "\n", "0", "89", "xml", "(|)") look as if
#   their angle-bracket markup was lost before this copy of the file; they are kept exactly
#   as found - restore them from the upstream source.
#
function build_xml(head,tail,entity,rows,sort,pagenum, csv,out,subxml,sc,doc,a,i,k,c,url,qin) {

        qin = urldecode(entity)

        # XML header
        out = "" "\n"
        out = out "" "\n"
        out = out "" "\n"
        out = out "0" "\n"
        out = out "89" "\n"
        out = out "" "\n"
        out = out "" "\n"
        out = out "xml" "\n"
        out = out "" rows "" "\n"
        out = out "" qin "" "\n"
        out = out "collection,creator,date,description,downloads,identifier,mediatype,publisher,title,year" "\n"
        out = out "0" "\n"
        # out = out "(title:balbach^100 OR description:balbach^15 OR collection:balbach^10 OR language:balbach^10 OR text:balbach^1)" "\n"
        out = out "" "\n"
        out = out "" "\n"
        out = out "" "\n"

        # Get CSV version (only content type that works)
        tail = "&fl[]=identifier&rows=" rows "&output=csv&callback=callback&save=yes&page=" pagenum
        url = head entity tail
        csv = http2var(url)

        # One single-row XML request per identifier
        tail = "&fl[]=date&fl[]=publisher&fl[]=collection&fl[]=creator&fl[]=description&fl[]=downloads&fl[]=identifier&fl[]=mediatype&fl[]=subject&fl[]=title&sort[]=" sort "&sort[]=&sort[]=&rows=1&output=xml&callback=callback&save=yes&page=" pagenum

        c = split(csv, a, "\n")
        while(i++ < c) {
            if(i == 1)
                continue                               # Skip CSV header
            gsub(/"/,"",a[i])                          # Remove quotes around CSV field
            entity = "identifier%3A%22" a[i] "%22"
            url = head entity tail
            subxml = http2var(url)
            gsub(/</,"<",subxml);gsub(/>/,">",subxml);gsub(/"/,"\"",subxml);gsub(/&/,"\\&",subxml)
            sc = split(subxml, doc, "(|)")
            if(length(doc[2]) > 10) {
                out = out "" "\n"
                out = out doc[2]
                out = out "" "\n"
            }
        }

        out = out "" "\n"
        out = out "" "\n"

        # Put the real number of CSV rows (lines minus the header) into the placeholder.
        # The count has to be concatenated into the replacement; inside the quotes it was
        # inserted as the literal text " c - 1 ".
        gsub(/numFound="15"/,"numFound=\"" (c - 1) "\"",out)

        return out
}

#
# Search Wikipedia article for book title. Return true if match found.
# False positives may arise here if the book title is short or simple phrase
#
#   booktitle  title as reported by IA; reduced with extracttitle() and fuzzy-matched
#              (agrep, error rate ".25") against the lower-cased global WPArticle
#   Returns 1 on a match, otherwise 0.
#
function searchwikipedia(booktitle) {

        if( agrep(tolower(WPArticle),tolower(extracttitle(booktitle)),".25") )
            return 1
        return 0
}

#
# Strip a book title (from IA) down to its bare essential (rm subtitle etc) so that it can be searched for in the WP article.
#
#   tit  the title
#   Returns the shortened, whitespace-stripped title. The remaining parameters are local variables.
#
function extracttitle(tit, c,l,f,h,k,o,p) {

        delete p

        # Convert any XML codes, or ";"
        # NOTE(review): the first three substitutions replace a character with itself as
        # written; presumably they once decoded XML entities - confirm against upstream.
        gsub(/>/,">",tit)
        gsub(/"/,"\"",tit)
        gsub(/&/,"\\&",tit)
        gsub(";",":",tit)

        # Rm all the words which follow one of these characters
        split(tit, h, "(—| - |[(]|:)")
        tit = h[1]

        # Rm special cases
        gsub(", in [Ff]our [Pp]arts.*","",tit)
        gsub("in [Ff]our [Pp]arts.*","",tit)
        gsub(", in [Tt]hree [Pp]arts.*","",tit)
        gsub("in [Tt]hree [Pp]arts.*","",tit)
        gsub(", in [Tt]wo [Pp]arts.*","",tit)
        gsub("in [Tt]wo [Pp]arts.*","",tit)
        gsub(/[Mm]lle[.]/, "Mademoiselle",tit)

        # Rm words following "." except in certain cases
        l = f = k = 0
        k = split(tit,o," ")
        while(l < k) {
            l++
            if(o[l] ~ /[.]/) {
                # Bypass these allowed words
                if(o[l] ~ /Mrs[.]|Mr[.]|[A-Z][.]|Inc[.]|Dr[.]|Capt[.]|doma[.]|St[.]|No[.]/) {
                    f++
                    p[f] = o[l]
                }
                else {
                    # Keep first occurrence of non-allowed "word." and drop the remaining words
                    f++
                    p[f] = o[l]
                    # Rm trailing ".": substr() positions start at 1; starting at 0 also
                    # swallowed the character before the "."
                    p[f] = substr(p[f],1,length(p[f]) - 1)
                    break
                }
            }
            else {
                f++
                p[f] = o[l]
            }
        }
        tit = join(p, 1, length(p), " ")

        # Rm words following "," except if first word (eg. "Sidonia, the Sorceress")
        f = l = k = 0
        delete p
        k = split(tit,o," ")
        while(l < k) {
            l++
            if(o[l] ~ /[,]/) {
                # Keep if first word
                if(f == 0) {
                    f++
                    p[f] = o[l]
                }
                else {
                    f++
                    p[f] = o[l]
                    p[f] = substr(p[f],1,length(p[f]) - 1)   # Rm the final character (same substr fix as above)
                    break
                }
            }
            else {
                f++
                p[f] = o[l]
            }
        }
        tit = join(p, 1, length(p), " ")

        # Rm last word if "by" (anchored so that words merely containing "by", eg. "Derby", are kept)
        delete p
        f = 0
        c = split(tit, o, " ")
        if(c >= 1 && strip(o[c]) ~ /^[Bb]y$/) {
            while(f < c - 1) {
                f++
                p[f] = o[f]
            }
            tit = join(p, 1, length(p), " ")
        }

        # Rm second to last word if "by" (and the word after it)
        delete p
        f = 0
        c = split(tit, o, " ")
        if(c >= 2 && strip(o[c - 1]) ~ /^[Bb]y$/) {
            while(f < c - 2) {
                f++
                p[f] = o[f]
            }
            tit = join(p, 1, length(p), " ")
        }

        return strip(tit)
}

#
# Build a RE with a date range
#
#   givendate  a year
#   Returns "(year-1|year|year+1)", eg. daterange(1860) gives "(1859|1860|1861)".
#
function daterange(givendate, re) {

        # Parentheses make the arithmetic-before-concatenation order explicit
        re = "(" (givendate - 1) "|" givendate "|" (givendate + 1) ")"
        return re
}

# _________________________ utilities _______________________________________________________

#
# http2var - fetch a URL with "wget -q -O- http://..." and
# return the HTML page as a string.
#
#   url  address to download
#   Uses the global Agent as the user-agent string. Returns "" when the global WTA is
#   not "wget" (no other download method is implemented here).
#
function http2var(url,    agent) {

        if ( WTA == "wget" ) {
            # Single-quote both values for the shell (a quote inside becomes '\''), so
            # characters such as $ or ` in a URL are passed through literally.
            agent = Agent
            gsub(/'/, "'\\''", agent)
            gsub(/'/, "'\\''", url)
            return sys2var("wget --user-agent='" agent "' -q -O- '" url "'")
        }
        return ""
}

#
# Run a system command and store result in a variable
#   eg. googlepage = sys2var("wget -q -O- http://google.com")
# Supports pipes inside command string. Stderr is sent to null.
# If command fails (errno) return null
# Credit: Stephen Balbach
#
#   command  shell command line; " 2>/dev/null" is appended before it is run
#   Returns the command's output with lines joined by "\n" (no trailing newline).
#   The remaining parameters are local variables.
#
function sys2var(command ,fish, scale, ship) {

        command = command " 2>/dev/null"
        while ( (command | getline fish) > 0 ) {
            if ( ++scale == 1 )
                ship = fish
            else
                ship = ship "\n" fish
        }
        close(command)
        return ship
}

#
# Remove certain trailing characters from a string
#
#   str  input string
#   Returns str without surrounding whitespace, without a trailing "()" and without a
#   trailing "," or ";".
#
function rmtrail(str) {

        str = strip(str)
        sub(/[(][)]$/, "", str)                        # remove trailing ()
        str = strip(str)                               # removing "()" can expose trailing blanks
        sub(/[,;]$/, "", str)                          # remove trailing , or ;
        return strip(str)
}

#----------------------------------------------------
# Approximate (fuzzy) matching using agrep
#
#  source  = source text
#  search  = text to search for in source
#  percent = maximum error rate percentage of search.
#             ie. if search is 12 characters and max error rate is 25%, set to ".25"
#             and it will return a match if up to 3 characters are wrong.
#
#  The number of allowed errors is capped at 6 (used for any search longer than 24
#  characters) and is never below 1 for searches shorter than 6 characters, or 2 otherwise.
#  Agrep set to case-insensitive, literal (-k) matching, and asked for a count (-c).
#  The binary is taken from the global AGREP.
#
#  Return 0 if no match (or if search is empty), otherwise number of matches
#
#----------------------------------------------------
function agrep(source, search, percent ,errorlimit,results,command) {

        # Nothing to look for: report "no match" instead of running agrep with an empty pattern
        if(length(search) == 0)
            return 0

        # Limit # of errors to 25% of length of str, or no more than 6, whichever is less
        if(length(search) > 24)
            errorlimit = 6
        else
            errorlimit = int(length(search) * percent)
        if(errorlimit < 2) {
            if(length(search) < 6)
                errorlimit = 1
            else
                errorlimit = 2
        }

        # Single-quote the pattern for the shell (a quote inside becomes '\''), so that
        # characters such as $, ` or \ in a book title reach agrep unchanged.
        gsub(/'/, "'\\''", search)
        command = AGREP " -i -k -c -" errorlimit " '" search "'"

        # Feed the source text to agrep through a coprocess and read back the count
        print source |& command
        close(command, "to")
        command |& getline results
        close(command)

        if(results > 0)
            return results
        else
            return 0
}

#
# Merge an array of strings into a single string. Array indices are numbers.
# Credit: GNU awk manual
#
#   arr    array indexed by numbers
#   start  index of the first element to use
#   end    index of the last element to use
#   sep    text placed between elements
#   Returns "" for an empty array, otherwise arr[start] sep ... sep arr[end].
#
function join(arr, start, end, sep, result, i) {

        if (length(arr) == 0)
            return ""

        i = start
        result = arr[i]
        while (++i <= end)
            result = result sep arr[i]

        return result
}

#
# url-encode a string
# Every character other than 0-9, A-Z and a-z is written as "%" plus two hex digits.
# Credit: Rosetta Stone May 2015
#
function urlencode(str, c, len, res, i, ord) {

        # Character -> code lookup table
        i = 0
        while (i <= 255) {
            ord[sprintf("%c", i)] = i
            i++
        }

        len = length(str)
        res = ""
        i = 0
        while (++i <= len) {
            c = substr(str, i, 1)
            res = res ((c ~ /[0-9A-Za-z]/) ? c : "%" sprintf("%02X", ord[c]))
        }

        return res
}

#
# Length of an array. Portable function for older versions of gawk
# Counts the consecutive indices 1, 2, 3 ... that are present.
#
function len2(array, i) {

        for (i = 1; i in array; i++)
            ;
        return i - 1
}

#
# strip HTML (method has general limitations but OK for this app)
# Returns s with every "<...>" sequence removed.
#
function striphtml(s) {

        gsub (/<[^>][^>]*>/, "", s)
        return s
}

#
# Check for file existence. Return 1 if exists, 0 otherwise.
#  Requires GNU Awk:
#     @load "filefuncs"
#
function exists(name ,fd) {

        return (stat(name, fd) == -1) ? 0 : 1
}

#
# Strip leading/trailing whitespace
#
function strip(str) {

        gsub(/^[[:space:]]+|[[:space:]]+$/,"",str)
        return str
}

#
# URL-decode
# Each "%xx" is replaced by the character with that hex code; the result is passed through strip().
#  source: http://rosettacode.org/wiki/URL_decoding#AWK
#
function urldecode(str, len,i,L,M,R) {

        len = length(str)
        i = 0
        while (++i <= len) {
            if (substr(str, i, 1) != "%")
                continue
            L = substr(str, 1, i - 1)                  # chars to left of "%"
            M = substr(str, i + 1, 2)                  # 2 chars to right of "%"
            R = substr(str, i + 3)                     # chars to right of "%xx"
            str = sprintf("%s%c%s", L, hex2dec(M), R)
        }

        return strip(str)
}

#
# Convert a string of hex digits to a number (helper for urldecode).
# Works from the last digit backwards, recursing on what is left.
#
function hex2dec(s, num) {

        num = index("0123456789ABCDEF", toupper(substr(s, length(s)))) - 1
        sub(/.$/, "", s)
        if (length(s))
            return num + 16 * hex2dec(s)
        return num + 0
}