#!/usr/local/bin/awk -f
@include "init.awk"
@include "library.awk"
BEGIN {
# ################################################################################################
#
# Internet Archive Search (ias). Adapted from iacs (Internet Archive Classic Search)
# Stephen Balbach October 2015
#
# Pass variable:
#
# -v id="/home/adminuser/wi-awb/temp/wi-awb-1026211441/"
# -v id="/home/adminuser/wi-awb/temp/wi-awb-1031200119/"
#
# (id is concatenated directly with file names below, so it must end with "/")
#
# Optionally pass:
#
# -v cache="delete"
#
# to delete the iascache directory when done (ie. when running via wi.csh)
#
# The purpose of this program is to search each book's metadata from a particular IA search
# and determine if it belongs to the intended author. It will search the first 50 results
# or whatever "Rows" is set to. It returns the number of books that it thinks match that author
# in the file ias.result -- see talgo.awk for algorithmic processing to determine best search choice.
#
#
# ____________________________________________________________________________________________
#
# Configuration variables:
#
# 1. Program filename - the name of the script. Plus other hard coded paths
MyProg = "ias.awk"
# Working directory prefix, taken verbatim from the -v id=... command-line variable
TDir = id
# 5. Number of results per page (50 was the IA default). Can be anything based on
# your computer speed and memory etc..
Rows = 500
#
# 7. URL Agent string - this is what Internet Archive sees in their logs. Recommend
# setting the name of the program and your contact information, such as an email
# or your Internet Archive userid.
Agent = "IAS.AWK: Wikipedia<->IA Project (iacs2015@nym.hush.com)"
#
# Maximum year a book can be published. ie. if a book publication year is beyond this then reject it as garbage data
#
MaxYear = 2030
#
# ##################################################################################################
# Clear the global G array, then run the whole search/scoring pipeline
delete G
main()
# Optional cleanup: only when the cache directory exists and -v cache="delete" was given
if( checkexists(TDir "iascache") && cache ~ /^delete$/ ) # Remove cached files save diskspace
sys2var( Exe["rm"] " -r -- " TDir "iascache")
}
#
# Program driver. Reads TDir "ia-results" (one "type|hits|name" record per line), and for every
# name/search-type pair runs search_ia() on the saved search URL, collecting scores in the global
# Result[] and per-work metadata in the global Scope[]. Finally writes ias.result and ias.scope
# and runs talgo on ias.result.
#
# All parameters are local work variables; the function is called without arguments.
# NOTE(review): i, re and oo are used below but are not in the local list, so they are globals.
#
function main( s,c,a,b,k,kname,mname,fulln,ktype,khits,m,n,o,d,count,debug,debugid,dest) {
debug = 0 # 1=on, 0=off
debugid = "tmpO@Alan_Maxwell@t@2" # Specified in this format
# Wikipedia article text, kept in a global for searchwikipedia()
WPArticle = clean( readfile(TDir "article.txt") )
delete Result
delete Final
delete Scope
s = readfile(TDir "ia-results") # Don't clean() - retain original
c = split(s, a, "\n")
# Build k[name][type][hits] from the non-blank lines that have a non-zero hit count
while (i++ < c) {
if(a[i] != "") { # remove blank lines
split(a[i], b, "|")
if(b[2] != 0) # remove 0 hit results
k[b[3]][b[1]][b[2]] = 1 # name|type|hits
}
}
# Make sure the cache directory used by scrapedate() exists
if(! checkexists(TDir "iascache") )
sys2var( Exe["mkdir"] " " TDir "iascache")
# presumably aborts the program if the directory is still missing - see checkexists() in library.awk
checkexists(TDir "iascache", "ias.awk main()", "exit")
# m[] is a flat copy of the names so the outer loop can iterate them independently of k[]
for(kname in k) {
m[kname] = kname
}
for(mname in m) {
#print "mname = " mname >> "bah"
for(kname in k) {
#print "kname = " kname >> "bah"
# Note: mname is used as a regular expression here, not compared as a plain string
if( kname ~ "^" mname "$" ) {
#print "MATCHED" >> "bah"
for(ktype in k[kname]) {
#print "ktype = " ktype >> "bah"
for(khits in k[kname][ktype]) {
#print "khits = " khits >> "bah"
# n[1..4] holds the file-name stem per search type; the index fixes the processing order (tx, t, w, none)
if(ktype == "none")
n[4] = "tmpO@" gensub(" ","_","g",kname) "@" ktype "@"
if(ktype == "w")
n[3] = "tmpO@" gensub(" ","_","g",kname) "@" ktype "@"
if(ktype == "t")
n[2] = "tmpO@" gensub(" ","_","g",kname) "@" ktype "@"
if(ktype == "tx")
n[1] = "tmpO@" gensub(" ","_","g",kname) "@" ktype "@"
fulln = kname
}
}
}
}
# Walk n[] in ascending numeric index order
PROCINFO["sorted_in"] = "@ind_num_asc"
for(o in n) {
# print "n[o] = " n[o]
# Probe for the first existing file named <stem>1 .. <stem>20
# (if none exists the loop ends with c == 21 and that name is used anyway)
c = 0
while(c++ < 20) {
if( checkexists(TDir n[o] c) )
break
}
s = readfile(TDir n[o] c) # Don't clean() - retain original
print "________________________________________________________________________________________________________________________________"
print n[o] c
print "________________________________________________________________________________________________________________________________"
# Drop the search.php URL prefix; a[1] is then the first space-delimited word, passed on as the query
split(gensub("[[]http://archive.org/search.php[?]query=","","g",s),a," ")
# b[1] = first word of the name, b[d] = last word
d = split(mname,b," ")
count = 0
if(b[1] !~ /^Sir$/) { # Skip names that begin with "Sir Whatever" as they produce odd results
if(debug) {
# In debug mode only the search whose id matches debugid is run
re = n[o] c
if(debugid ~ re)
count = search_ia(a[1],1,"",b[1],b[d],fulln,n[o] c)
}
else {
count = search_ia(a[1],1,"",b[1],b[d],fulln,n[o] c)
}
} else {
# "Sir ..." names are not searched; every field is recorded as -1
count--
Result[n[o] c]["realcount"] = -1
Result[n[o] c]["numfound"] = -1
}
print " |1-----|"
print count
Result[n[o] c]["count"] = count
}
delete n
}
# Create ias.result (list of possibles)
if(checkexists(TDir "ias.result")) {
close(TDir "ias.result")
sys2var( Exe["rm"] " " TDir "ias.result")
}
# dest[] receives the Result keys in sorted order
asorti(Result, dest)
if(length(dest)) {
for(o in dest)
print dest[o] " | " Result[dest[o]]["count"] " | " Result[dest[o]]["realcount"] " | " Result[dest[o]]["numfound"] >> TDir "ias.result"
close(TDir "ias.result")
}
# Create ias.scope (list of metadata)
if(checkexists(TDir "ias.scope"))
sys2var( Exe["rm"] " " TDir "ias.scope")
for(o in Scope) {
for(oo in Scope[o])
print o "|" oo "|" Scope[o][oo]["mediatype"] "|" Scope[o][oo]["date"] "|" Scope[o][oo]["jstor"] >> TDir "ias.scope"
}
close(TDir "ias.scope")
print " |2-----|"
# Run talgo.awk algorithm to find best choice. Load from ias.result and save to ias.out - "-s" save to ia9out-scope
if(length(dest))
sys2var( Exe["talgo"] " -s -l " TDir "ias.result")
print strip( readfile(TDir "ias.result") )
print "Final: " strip( readfile(TDir "ias.out") )
}
#
# Copy the summary of one search into final[]: its name plus the "count",
# "realcount" and "numfound" fields.
#
# The values are read from the global Result[name]; the "result" argument
# is accepted but never consulted.
#
function copyarray(result, final, name,    fields, nfields, idx, key) {
    final["name"] = name
    nfields = split("count realcount numfound", fields, " ")
    for (idx = 1; idx <= nfields; idx++) {
        key = fields[idx]
        final[key] = Result[name][key]
    }
}
#
# Search Internet Archive and format results.
#
# Downloads one page of advancedsearch.php XML for "entity", extracts the metadata of each
# returned work into G[] and scores it with find_searchtokens_full().
#
# entity   - query string, appended as-is after "q="
# pagenum  - result page number
# sort     - value for the first sort[] URL parameter
# firstn   - first word of the author name (forwarded to init_searchtokens_full)
# lastn    - last word of the author name (forwarded to init_searchtokens_full)
# fulln    - full author name (forwarded to find_searchtokens_full)
# idstring - key under which totals are stored in the global Result[]
# Remaining parameters are local work variables.
#
# Returns the sum of the per-work scores (each work contributes +1 or -1),
# -1 when nothing could be downloaded, or -1001 when the running total drops so low
# that the search is abandoned early.
# Side effects: sets Result[idstring]["realcount"] and ["numfound"], rebuilds TokensFull[].
#
# NOTE(review): several regex/split patterns in this function are empty as shown ("(|)", //, "(")
# and look as if XML tag text was lost from them - confirm against the upstream file before editing.
# NOTE(review): ad, ac, currcount and base are not in the local list, so they are globals.
#
function search_ia(entity, pagenum, sort, firstn, lastn, fulln, idstring, head,tail,url,xml,c,numfound,doc,page,arr,arrg,arrgh,qin,G,i,count,tokenhits) {
head = "http://archive.org/advancedsearch.php?q="
tail = "&fl[]=date&fl[]=publisher&fl[]=collection&fl[]=creator&fl[]=description&fl[]=downloads&fl[]=identifier&fl[]=mediatype&fl[]=subject&fl[]=title&sort[]=" sort "&sort[]=&sort[]=&rows=" Rows "&output=xml&callback=callback&save=yes&page=" pagenum
url = head entity tail
#print url >> "bah"
# Download XML from Internet Archive
xml = http2var2(url)
if ( xml == "" || length(xml) == 0) {
print "Error in function search_ia: Unable to retrieve data from Internet Archive." > "/dev/stderr"
return -1
}
# A very short reply is treated as a failed XML response; rebuild it work-by-work instead
if(length(xml) < 100) { # Bug in Internet Archive.
print "Warning: XML bug. Using build_xml()"
xml = build_xml(head,tail,entity,Rows,sort,pagenum)
}
# NOTE(review): as shown each of these replaces a character with itself - presumably entity decoding; confirm
gsub(/</,"<",xml);gsub(/>/,">",xml);gsub(/"/,"\"",xml);gsub(/&/,"\\&",xml)
# print xml > "test.xml"
# close("test.xml")
Result[idstring]["realcount"] = 0
# Split the reply into doc[]; the loop below reads only the even-numbered pieces
c = split(xml, doc, "(|)")
# numFound="432"
match(xml, "numFound=\"[0-9]+\"", arr)
split(arr[0], arrg, "\"")
numfound = arrg[2]
Result[idstring]["numfound"] = numfound
# name="qin">text< (search string as reported by IA)
split(doc[1], arr, "(name=\"qin\">)")
split(arr[2], arrg, "")
qin = arrg[1]
# Derive the global TokensFull[] (names, dates) from the query string
init_searchtokens_full(qin,firstn,lastn)
count = 0
i = 0
# Visit doc[2], doc[4], ... up to doc[Rows * 2]
while(i < (Rows * 2) ) {
i = i + 2
delete G
# identifier (name="identifier">presidentgarfiel00hinsuoft<)
match(doc[i], "name=\"identifier\">[^<]+<", arr)
split(arr[0],arrg,"(<|>)")
G["identifier"] = arrg[2]
if ( arrg[2] == "" )
continue # Skip if no id
# creator and subject and collection (if multiple names they are surrounded by , remove those and leave each name surrounded by )
ad = 0
ac = split(doc[i], arr, "\n") # possible point of failure if XML format ever stopped having a \n after each line
# Line-by-line scan; each branch stores one field and moves on to the next line
while(ad++ < ac) {
if(arr[ad] ~ // ) {
gsub(//,"",arr[ad])
gsub(/<\/arr>/,"",arr[ad])
G["creator"] = strip(arr[ad])
continue
}
if(arr[ad] ~ // ) {
gsub(//,"",arr[ad])
gsub(/<\/str>/,"",arr[ad])
G["creator"] = strip(arr[ad])
continue
}
if(arr[ad] ~ // ) {
gsub(//,"",arr[ad])
gsub(/<\/arr>/,"",arr[ad])
G["subject"] = strip(arr[ad])
continue
}
if(arr[ad] ~ // ) {
gsub(//,"",arr[ad])
gsub(/<\/str>/,"",arr[ad])
G["subject"] = strip(arr[ad])
continue
}
if(arr[ad] ~ // ) {
gsub(//,"",arr[ad])
gsub(/<\/arr>/,"",arr[ad])
G["collection"] = strip(arr[ad])
continue
}
if(arr[ad] ~ // ) {
gsub(//,"",arr[ad])
gsub(/<\/str>/,"",arr[ad])
G["collection"] = strip(arr[ad])
continue
}
}
# description (name="description">text<)
# NOTE(review): the pattern "(" is an unbalanced group as shown, and arrg[] below still holds the
# identifier split from above - this statement pair looks truncated; confirm
split(doc[i], arr, "(")
G["description"] = striphtml(substr(arrg[1], 2))
# date (name="date">1881-01-01T00:00:00Z) - only the part before the first "-" is kept
match(doc[i], "name=\"date\">[^<]+<", arr)
split(arr[0],arrg,"(<|>)")
split(arrg[2],arrgh,"-")
G["date"] = strip(arrgh[1])
# mediatype (name="mediatype">texts<)
match(doc[i], "name=\"mediatype\">[^<]+<", arr)
split(arr[0],arrg,"(<|>)")
G["mediatype"] = arrg[2]
# title (name="title">Title<)
match(doc[i], "name=\"title\">[^<]+<", arr)
split(arr[0],arrg,"(<|>)")
G["title"] = arrg[2]
# Score this work; realcount counts only the works that scored positive
currcount = find_searchtokens_full(G, fulln, idstring)
count = count + currcount
if(currcount > 0) {
Result[idstring]["realcount"]++
}
# base = the smaller of numfound and Rows; give up once count falls below -(base / 2) - 1
if(numfound <= Rows) # Abort early - no chance
base = numfound
else
base = Rows
if(count < ( ( (base / 2) * -1) -1) )
return -1001
}
return count
}
#
# Score one work (G[]) against the search tokens in the global TokensFull[].
#
# G        - metadata of the work: identifier, creator, subject, collection, description,
#            date, mediatype, title. Field values are rewritten in place ("-" becomes " ").
# fulln    - full author name
# idstring - key of the current search, used to index the global Scope[]
# Remaining parameters are local work variables.
#
# Returns 1 when the final score is above zero, otherwise -1. Works in the opensource_audio
# collection, of mediatype "software", or with "arxiv" in the identifier return -1 immediately.
# Side effects: fills Scope[idstring][identifier] ("mediatype", "date", "jstor") and prints a
# line for every match plus the final "identifier | count".
#
# NOTE(review): tokenflag, agrep_debug, caindex, re2, agrep_type and agrep_weight are not in
# the local list, so they are globals.
#
function find_searchtokens_full(G, fulln, idstring, token,subentity,count,cc,ci,ca,da,dc,di,dm,re,stopflag,debug,debugid,debugo) {
# Enable debug to examine how it scores individual work IDs. Some output saved to file "test" (overwritten with each run)
debug = 0 # 1=on, 0=off
debugid = "landscapingfordu00giro"
debugo = "/home/pepper/wi-awb/debugo"
if(G["identifier"] ~ debugid && debug) print "Starting" > debugo
dc = split(TokensFull["_date"], da, " ")
count = 0
if( searchwikipedia(G["title"]) ) { # If the book title is in the Wikipedia article..
count++
if(G["identifier"] ~ debugid && debug) {
print " Matched:"
print "+1 in special 1-2 for " subentity >> debugo
}
print " " G["identifier"] " | title | (wikipedia article) | = " G["title"]
}
if( length(G["identifier"]) < 1) G["identifier"] = "unknown"
Scope[idstring][G["identifier"]]["mediatype"] = G["mediatype"]
Scope[idstring][G["identifier"]]["date"] = 0 # 1 if work is matched on a date
Scope[idstring][G["identifier"]]["jstor"] = 0 # 1 if work is matched on creator and work is JSTOR
# Outer loop: every metadata field of the work
for(subentity in G) {
if(G["identifier"] ~ debugid && debug)
print "--Starting subentity " subentity
if(subentity ~ /^collection$/ ) { # Skip collection:opensource_audio works
if(G[subentity] ~ /opensource_audio/) {
return -1
}
}
if(subentity ~ /^mediatype$/ ) { # Skip mediatype:software works
if(G[subentity] == "software") {
return -1
}
}
if(subentity ~ /^identifier$/ ) { # Skip arxiv works
if(G[subentity] ~ /arxiv/) {
return -1
}
continue # Skip "identifier" subentity
}
# Remove all "-" (same thing done in init_searchtokens())
G[subentity] = strip(gensub("-"," ","g",G[subentity]))
stopflag = 0
tokenflag = 0
# Inner loop: every search token, tested against the current field
for(token in TokensFull) {
if(TokensFull[token] == "") continue
# Skip the name tokens if 1 has already matched to avoid repeat matches due to regex+agrep
if(tokenflag == 1 && substr(token,1,1) !~ /_/ ) continue
if(token ~ /^_(lastname|firstname|lastname_special)$/) {
if(G["identifier"] ~ debugid && debug)
print "Skipping " token " in " subentity >> debugo
continue
}
agrep_debug = 0
if(G["identifier"] ~ debugid && debug)
agrep_debug = 1
# Case (a): field other than creator/subject holds a birth..death range (each year +/-3) and the last name
if(token == "_date" && subentity !~ /^creator$|^subject/) { # See below for creator/subject + date check
re = "(" daterange(TokensFull["_birth"]) ").+{1,3}(" daterange(TokensFull["_death"]) ")"
if( match(G[subentity], re) && agrep(G[subentity], TokensFull["_lastname"], ".25", agrep_debug, "plain") ) {
count++
if(G["identifier"] ~ debugid && debug) {
print " Matched:"
print "+1 in _date: G[" subentity "]=|" G[subentity] "| TokensFull[" token "]=|" TokensFull[token] "| lastname=|" TokensFull["_lastname"] "|" >> debugo
}
print " " G["identifier"] " | " subentity " |" TokensFull[token] "| = " G[subentity] " (a)"
Scope[idstring][G["identifier"]]["date"] = 1 # 1 if work is matched on a date
continue
}
}
# Case (b): both "special" first and last name fuzzy-match the field
if(token == "_firstname_special") {
if( agrep(G[subentity], TokensFull["_firstname_special"], ".25", agrep_debug, "plain") && agrep(G[subentity], TokensFull["_lastname_special"], ".25", agrep_debug, "plain") ) {
count++
if(G["identifier"] ~ debugid && debug) {
print " Matched:"
print "+1 in special for " subentity >> debugo
}
print " " G["identifier"] " | " subentity " |" TokensFull["_firstname_special"] " " TokensFull["_lastname_special"] "| = " G[subentity] " (b)"
continue
}
}
# Case (c): creator/subject fields are split into individual names, cleaned, then matched one by one
# If creator or subject string contains a date, and it doesn't match one of the birth-death dates in TokensFull, then don't make a match.
if(subentity ~ /^creator$|^subject/ && token != "_birth" && token != "_death") {
if(G["identifier"] ~ debugid && debug && stopflag == 0) {
print " " subentity ": " G[subentity]
stopflag = 1
#for(z in TokensFull)
#print z " = " TokensFull[z]
#print "subentity = " subentity
#print "token = " token
}
# NOTE(review): the patterns on the next two lines and in the "by user tpb" test below contain
# empty alternatives/regexes as shown - tag text presumably lost; confirm
cc = split(G[subentity], ca, /|<\/str>/) # when multi-names are separated by /<\/str>
G[subentity] ~ // ? caindex = 2 : caindex = 1
if(G["identifier"] ~ debugid && debug)
print "caindex(1) = " ca[caindex]
if(ca[caindex] ~ /;/) { # when multi-names are separated by ; ..this may break in certain cases
cc = split(ca[caindex], ca, /;/)
} else if(ca[caindex] ~ /[ ]and[ ]/) { # when multi-names are separated by / and / eg. archive.org/details/UsgsBulletin507MiningDistrictsOfTheWesternUnitedStates
cc = split(ca[caindex], ca, /[ ]and[ ]/)
} else if(G["description"] ~ /by user tpb/ && ca[caindex] !~ /[0-9]{4}/ && G[subentity] !~ // && length(ca[caindex]) > length(fulln) + 15 ) { # when multi-names sep by , ("uploaded by user tpb")
cc = split(ca[caindex], ca, /,/)
}
#if(G["identifier"] ~ debugid && debug)
#print "caci(-2) = " ca[1]
ci = 0
while(ci++ < cc) {
if(ca[ci] == "") continue
ca[ci] = strip(ca[ci])
#if(G["identifier"] ~ debugid && debug)
#print "caci(-1) = " ca[ci]
gsub(/[[]/,"",ca[ci])
gsub(/[]]/,"",ca[ci])
#if(G["identifier"] ~ debugid && debug)
#print "caci(0) = " ca[ci]
re = "[.|,|;|-]{0,1}[ ]{0,1}from old catalog.*$" # remove trailing "[from old catalog]" and variations
if(ca[ci] ~ re)
ca[ci] = rmtrail( gensub(re, "", "g", ca[ci]) )
#if(G["identifier"] ~ debugid && debug)
#print "caci(0.1) = " ca[ci]
if(token != "_date") { # leave date in if checking for date otherwise remove it
re = "[Bb][.][ ]{0,1}" "(" daterange(TokensFull["_birth"]) ")$" # remove trailing "b. 1860"
if(ca[ci] ~ re) {
ca[ci] = rmtrail( gensub(re, "", "g", ca[ci]) )
}
#if(G["identifier"] ~ debugid && debug)
#print "caci(1) = " ca[ci]
re = "(" daterange(TokensFull["_birth"]) ").+{1,3}(" daterange(TokensFull["_death"]) ")"
ca[ci] = rmtrail( gensub( re, "", "g", ca[ci]) ) # remove date
#if(G["identifier"] ~ debugid && debug)
#print "caci(2) = " ca[ci]
ca[ci] = rmtrail( gensub(/[(][^\)]*[)]$/, "", "g", ca[ci]) ) # remove final parenthesis content eg. Loudon, W. J. (William James)
#if(G["identifier"] ~ debugid && debug)
#print "caci(3) = " ca[ci]
re = "(" daterange(TokensFull["_birth"]) ")[-]{0,1}$" # remove trailing "1860-" ie. when no death date is given
if(ca[ci] ~ re) {
ca[ci] = rmtrail( gensub(re, "", "g", ca[ci]) )
#if(G["identifier"] ~ debugid && debug) {
#print "re = " re
#print "caci(4) = " ca[ci]
#}
}
ca[ci] = tolower(ca[ci])
ca[ci] = rmtrail( gensub("sir$", "", "g", ca[ci]) ) # remove trailing "Sir"
ca[ci] = strip( gensub("^sir ", "", "g", ca[ci]) ) # remove leading "Sir "
ca[ci] = strip( gensub(/annotator|translator/, "", "g", ca[ci]) ) # remove various
}
# Build the pattern for this token: the whole name anchored at both ends, or the date alternatives
if(token != "_date") {
re = "^" regesc2(tolower(TokensFull[token])) "$" # eg. "^william t[.] volk$"
gsub("-","",re) # Is this needed?
}
else {
# eg, "1860.+{1}1910" which is like 1860?1910 - will match "#-#", "# - #", "# #", etc.. also catch "b. ####$" and "####-$"
re = "(" daterange(TokensFull["_birth"]) ").+{1,3}(" daterange(TokensFull["_death"]) ")|([Bb][.][ ]{0,1}(" daterange(TokensFull["_birth"]) ")$)|((" daterange(TokensFull["_birth"]) ")[-]{0,1}$)"
}
agrep_debug = 0
if(G["identifier"] ~ debugid && debug) {
print " Step 1 (" token "): " tolower(ca[ci]) " | " re
agrep_debug = 1
}
# dm flags a hit for this name; it is reset to 0 after the hit is counted below
if(token != "_date" && agrep(ca[ci], re, ".25", agrep_debug, "regex", length(TokensFull[token]) ) ) {
dm = 1
if(subentity ~ /^creator$/ && tolower(G["identifier"]) ~ /jstor/)
Scope[idstring][G["identifier"]]["jstor"] = 1 # 1 if creator field matches and JSTOR
}
# Check for lastname: ^roy|[ ]roy|[(]roy
re2 = "^" regesc2(tolower(TokensFull["_lastname"])) "|[ ]{1,}" regesc2(tolower(TokensFull["_lastname"])) "|[(]" regesc2(tolower(TokensFull["_lastname"]))
if(token == "_date" && match(ca[ci], re) && agrep(ca[ci], re2, ".25", agrep_debug, "regex", length(TokensFull["_lastname"]) ) ) { # match date + lastname
dm = 1
Scope[idstring][G["identifier"]]["date"] = 1 # 1 if work is matched on a date
}
if(dm == 1) {
if(G["identifier"] ~ debugid && debug) {
print " Matched:"
print " +1 for " token " in " subentity >> debugo
}
print " " G["identifier"] " | " subentity " |" TokensFull[token] "| = " ca[ci] " (c)"
count++
dm = 0
tokenflag = 1
}
}
continue
}
# Case (d): all remaining field/token combinations - fuzzy match of the token against the whole field
agrep_debug = 0
if(G["identifier"] ~ debugid && debug)
agrep_debug = 1
agrep_type = "regex"
agrep_weight = "0.25"
if(subentity ~ /^description$|^title$/) {
if(token !~ /_date|_birth|_death/ ) {
# Tokens containing "." are searched as plain text with a tighter error rate
if(TokensFull[token] ~ /[.]/) {
re = tolower(TokensFull[token])
agrep_type = "plain"
agrep_weight = "0.10"
}
else
re = "^" regesc2(tolower(TokensFull[token])) "|[ ]{1,}" regesc2(tolower(TokensFull[token])) "|[(]" regesc2(tolower(TokensFull[token]))
}
else
re = regesc2(tolower(TokensFull[token]))
}
else {
re = regesc2(tolower(TokensFull[token]))
}
if( agrep(tolower(G[subentity]), re, agrep_weight, agrep_debug, agrep_type, length(TokensFull[token]) ) && token != "_birth" && token != "_death" ) {
count++
if(G["identifier"] ~ debugid && debug) {
print " Matched:"
print "+1 for " token " (" TokensFull[token] ") in " subentity ": " G[subentity] >> debugo
}
print " " G["identifier"] " | " subentity " |" TokensFull[token] "| = " G[subentity] " (d)"
tokenflag = 1
}
}
}
# Date plausibility penalty: subtract one when datehit() reports a miss
if(datehit(G,TokensFull["_birth"],count) == 0) # This must be the last modifier to count
count--
if(G["identifier"] ~ debugid && debug)
close(debugo)
print G["identifier"] " | " count
if(count > 0) return 1
return -1
}
#
# Date plausibility check for one work.
#
# G     - metadata of the work; G["date"] may be filled in here via scrapedate()
# birth - author's year of birth
# count - score the work has accumulated so far
#
# Returns 1 when the work is exempt from the check or when its year lies between
# TokensFull["_birth"] + 20 and TokensFull["_death"]; returns 0 otherwise,
# including when no date is known. Prints the date that was examined.
#
function datehit(G,birth,count) {
    # Exemptions, tested in this order:
    #  - anything that is not "texts" (audio/video collections have no historic dates)
    #  - Project Gutenberg items (they carry no dates)
    #  - authors born before 1830 (most of their books on IA are later reprints)
    if (G["mediatype"] !~ "texts" || tolower(G["description"]) ~ /project gutenberg/ || birth < 1830)
        return 1

    # Scraping is slow, so it is only attempted when the score is exactly 1 -
    # for any other score the outcome would not matter.
    if (G["date"] == "" && count == 1)
        G["date"] = scrapedate(G["identifier"])

    print "\nDATE = " G["date"]

    if (G["date"] != "" && G["date"] >= TokensFull["_birth"] + 20 && G["date"] <= TokensFull["_death"])
        return 1
    return 0
}
#
# Rebuild the global TokensFull[] from the search string reported by Internet Archive.
#
# qin    - search string as reported by IA
# firstn - first word of the author name, stored as TokensFull["_firstname"]
# lastn  - last word of the author name, stored as TokensFull["_lastname"]
# Remaining parameters are local work variables.
#
# Keys produced:
#   _firstname, _lastname                  - as passed in
#   _firstname_special, _lastname_special  - only for the unusual AND/OR query shapes listed below
#   _date, _birth, _death                  - from a quoted "YYYY-YYYY" string in the query
#   <name>                                 - each quoted multi-word name that directly follows a ":"
# Every value has "-" replaced by a space. The query and the resulting tokens are printed.
#
function init_searchtokens_full(qin, firstn, lastn ,a,c,d,e,h,head,aoc,aoa,aoaa,aoi,andorstr,i,offset) {
    delete TokensFull
    TokensFull["_lastname"] = lastn
    TokensFull["_firstname"] = firstn

    # Populate _first/lastname_special when a unusual search string. See test.awk specialtest() for checking test cases.
    # Note: these are specific and need to be modified if search string format (qin) ever changes ie. in iaa.lua

    # andorstr = the sequence of AND/OR operators in qin, space separated
    aoc = patsplit(qin, aoa, /AND|OR/)
    while(aoi++ < aoc)
        andorstr = andorstr " " aoa[aoi]
    andorstr = strip(andorstr)

    # The six-operator shape must be tested first: it contains "AND OR AND", so the
    # unanchored test for the shorter shapes would otherwise always claim it.
    if(andorstr ~ "AND OR AND OR OR AND") {
        split(qin,aoa,/AND|OR/)
        gsub(/[)]|[(]|["]/,"",aoa[2])
        gsub(/[)]|[(]|["]/,"",aoa[4])
        # Second piece is the first name, fourth piece is the last name
        TokensFull["_firstname_special"] = strip(aoa[2])
        TokensFull["_lastname_special"] = strip(aoa[4])
    }
    else if(andorstr ~ /AND OR OR AND OR|AND OR AND|AND OR OR AND|AND OR AND OR/ ) {
        split(qin,aoa,/AND|OR/)
        gsub(/[)]|[(]|["]/,"",aoa[2])
        # Second piece is the full name: first and last word become the special tokens
        aoi = split(aoa[2],aoaa," ")
        TokensFull["_firstname_special"] = strip(aoaa[1])
        TokensFull["_lastname_special"] = strip(aoaa[aoi])
    }

    # AND OR OR AND OR
    # 5+word extended ascii
    # example: (-mediatype:software) AND (((Claude Prosper Jolyot de Crébillon) OR (Claude Prosper Jolyot de Cr*billon)) OR ("1707-1777" AND ("Crébillon" OR "Crebillon")))
    # AND OR OR AND
    # 5+word extended ascii
    # example: (-mediatype:software) AND ((François Christophe Edmond de Kellermann OR (Fran*ois Christophe Edmond de Kellermann)) OR ("1802-1868" AND "Kellermann"))
    # AND OR AND OR
    # 5+word extended ascii
    # example: (-mediatype:software) AND ((Claude Prosper Jolyot de Crébillon) OR ("1707-1777" AND ("Crébillon" OR "Crebillon")))
    # AND OR AND
    # 5+word extended ascii
    # example: (-mediatype:software) AND ((François Christophe Edmond de Kellermann) OR ("1802-1868" AND "Kellermann"))
    # AND OR AND OR OR AND
    # First char extended ascii special search
    # example: (-mediatype:software) AND ((("Étienne" OR "Etienne") AND ("Aignan" OR "Aignan")) OR ("1773-1824" AND "Aignan"))

    # Split on double quotes. When qin does not start with a quote, offset = 1 makes the
    # loop step two pieces at a time so that only the quoted pieces are examined.
    c = split(qin, a, "\"")
    offset = 0
    if(length(a[1]) > 0)
        offset = 1
    i = 0 + offset
    while(i++ < c) {
        if(a[i] ~ /^[0-9]{4}-[0-9]{4}$/) {
            TokensFull["_date"] = a[i]
            i = i + offset
            split(TokensFull["_date"], h, "-")
            TokensFull["_birth"] = h[1]
            TokensFull["_death"] = h[2]
            continue
        }
        # Filter out 1-word names (unless name really is 1 word)
        if(firstn !~ lastn) {
            d = split(a[i],e," ")
            if(d == 1) {
                i = i + offset
                continue
            }
        }
        # Filter out when last character is "." eg. "Jéquier, G."
        if( substr(a[i], length(a[i]), 1) ~ /[.]/ ) {
            i = i + offset
            continue
        }
        # Filter out duplicates caused by extended ascii eg. Gustave Jéquier / Gustave Jequier (note the é/e)
        # because agrep will handle these cases in find_searchtokens(). This will break badly if search syntax ever changes.
        if(a[i - 1] ~ /[:]$/)
            TokensFull[a[i]] = a[i]
        i = i + offset
    }

    # Remove all "-" (same thing done in find_searchtokens())
    for(head in TokensFull)
        TokensFull[head] = strip(gensub("-"," ","g",TokensFull[head]))

    print qin
    print " |5-----|"
    for(head in TokensFull)
        print head " = " TokensFull[head]
    print " |6-----|"
}
#
# highlight search tokens
#
# str - text to process, split on whitespace into words
# Remaining parameters are local work variables.
#
# Each word is compared, case-insensitively and with surrounding punctuation removed,
# against the keys of the global Tokens[]. Returns the words joined by single spaces.
#
# NOTE(review): the strings wrapped around a matching word are empty as shown, so the
# returned text carries no visible highlight - the markers were presumably lost; confirm.
#
function hightokens(str ,c,arr,i,t,out,build,work) {
c = split(str, arr, " ")
while(i < c) {
i++
out = ""
# work = the word with punctuation removed, used only for the comparison
work = arr[i]
gsub("([,]|\"|[.]$|\"[.]$|'[.]$|`[.]$|;|[:]|^'|'$|^`|`$|'s$|'s[.]$)","",work) # <-- Customize search token exceptions
for ( t in Tokens ) {
if ( tolower(t) == tolower(work) ) {
out = "" arr[i] ""
break
}
}
# No token matched: emit the word unchanged
out == "" ? out = arr[i] : ""
# The first word starts the result, later words are appended after a space
i == 1 ? build = out : build = build " " out
}
return build
}
#
# Rebuild the global Tokens[] from an IA query string.
#
# iaquery - query string; double quotes are removed and "+" is turned into a space
# Remaining parameters are local work variables.
#
# Every whitespace-separated word becomes a key (and value) of Tokens[], except
# words containing "AND", "OR" or ":". Trailing commas, trailing ")" and leading "("
# are trimmed from each word first.
#
function init_searchtokens(iaquery ,arr, arrg, i, k, c,    word) {
    delete Tokens

    gsub("\"","",iaquery)
    gsub("+"," ",iaquery)

    split(iaquery, arr, "+")
    k = join(arr, 1, len2(arr), " ")

    c = split(k, arrg, " ")
    for (i = 1; i <= c; i++) {
        word = arrg[i]
        if (word ~ "(AND|OR|:)")     # <-- Customize tokens to ignore eg. logic statements, etc..
            continue
        gsub(/[,]+$/,"",word)
        gsub(/[)]+$/,"",word)
        gsub(/^[(]+/,"",word)
        Tokens[word] = word
    }
}
#
# If date of work is empty, it may be due to bug at Internet Archive caused when date field contains non-ascii characters it returns empty date.
# See examples paramountmethodf00fode ("c1922") and greeceallies191400abbouoft ("[1922]")
# This function scrapes the main work page searching for "stated date is 1922" in copyright-notice, or failing that looking at the meta.xml file.
#
# wid - Internet Archive identifier of the work
# Remaining parameters are local work variables.
#
# Returns the first 4-digit year found, trying in order:
#   1. "stated date is NNNN" on the details page
#   2. for citebank collections: the <volume> element, or <year> when there is no <volume>
#   3. the <date> element of meta.xml
#   4. the <year> element of meta.xml
#   5. the <publisher> element, accepted only when the year is above 1200 and below MaxYear
# Returns "" when nothing is found.
#
# Both downloads are cached under TDir "iascache/" and re-read from there on later calls.
#
function scrapedate(wid, page,a,b,c,d,e,metaxml,debug,debugid,debugo,widu,fname,f) {
    # Enable debug to examine how it handles individual work IDs.
    debug = 0 # 1=on, 0=off
    debugid = "visionbowditchhp00bowdrich"
    debugo = "/home/adminuser/wi-awb/testpage"

    widu = gensub(/ /,"_","g",wid) # Convert " " to "_". Should never happen but in case.

    # Read from local cache, if not exist load from web and save to cache
    fname = TDir "iascache/" widu
    if( checkexists( fname ) ) {
        page = readfile( fname )
    }
    else {
        page = http2var2("http://archive.org/details/" widu)
        print page > fname
        close( fname )
    }

    if( match(page, "stated date is [0-9]{4}", a) ) { # If it has a "stated date is XXXX" on the main work page, use that.
        if(wid ~ debugid && debug)
            print " Step 0: " a[0]
        # The last five characters are " NNNN"; strip() drops the leading space
        return strip( substr(a[0], length(a[0]) - 4, length(a[0]) ) )
    }

    if(wid ~ debugid && debug) {
        print page > debugo
        close(debugo)
    }

    # Otherwise, get the meta.xml file and see if it has a pair
    # Read from local cache, if not exist load from web and save to cache
    fname = TDir "iascache/" widu "_meta.xml"
    if( checkexists( fname ) ) {
        metaxml = readfile( fname )
    }
    else {
        metaxml = http2var2("http://archive.org/download/" widu "/" widu "_meta.xml")
        print metaxml > fname
        close( fname )
    }

    match(metaxml, /[<]collection[>][^<]+[<][/]collection[>]/, e)
    if(tolower(e[0]) ~ /citebank/) { # Special case for citebank
        if( match(metaxml, /[<]volume[>][^<]+[<][/]volume[>]/, f) ) {
            if(wid ~ debugid && debug)
                print " Step 2: " f[0]
            if( match(f[0], /[0-9]{4}/, d) ) {
                return strip(d[0])
            }
        } else {
            if( match(metaxml, /[<]year[>][^<]+[<][/]year[>]/, c) ) {
                if(wid ~ debugid && debug)
                    print " Step 2: " c[0]
                if( match(c[0], /[0-9]{4}/, d) ) {
                    return strip(d[0])
                }
            }
        }
    }

    if( match(metaxml, /[<]date[>][^<]+[<][/]date[>]/, c) ) { # Check for a date element
        if(wid ~ debugid && debug)
            print " Step 2: " c[0]
        if( match(c[0], /[0-9]{4}/, d) ) {
            return strip(d[0])
        }
    }

    # If still nothing, check for a year element
    if( match(metaxml, /[<]year[>][^<]+[<][/]year[>]/, c) ) {
        if(wid ~ debugid && debug)
            print " Step 2: " c[0]
        if( match(c[0], /[0-9]{4}/, d) ) {
            return strip(d[0])
        }
    }

    # If still nothing, check if a date is in publisher field eg. see imperialgazette00meyegoog
    if( match(metaxml, /[<]publisher[>][^<]+[<][/]publisher[>]/, c) ) {
        if(wid ~ debugid && debug)
            print " Step 2: " c[0]
        if( match(c[0], /[0-9]{4}/, d) ) {
            if(strip(d[0]) > 1200 && strip(d[0]) < MaxYear ) {
                return strip(d[0])
            }
        }
    }

    if(wid ~ debugid && debug)
        print " Step 3: none found"
    return ""
}
#
# Build a passable XML file due to bug in Internet Archive (in some cases) triggered by unknown reasons.
# 1. create a fake XML header
# 2. download list of identifier's in CSV format (this works for some reason but XML doesn't)
# 3. for each identifier, request *individual* XML (one for each work) and extract the portion between
# 4. build a complete XML from the parts
#
# head, tail - URL prefix and suffix (tail is overwritten inside the function)
# entity     - query string (overwritten inside the loop with an identifier query)
# rows       - number of rows to request in the CSV listing
# sort       - value for the first sort[] URL parameter
# pagenum    - result page number
# Remaining parameters are local work variables.
#
# Returns the assembled text.
#
# NOTE(review): most string literals that make up the header and the per-work wrappers are
# empty as shown, and the split pattern is "(|)" - tag text was presumably lost; confirm
# against the upstream file before changing anything here.
# NOTE(review): qin, url and c are not in the local list, so they are globals.
#
function build_xml(head,tail,entity,rows,sort,pagenum, csv,out,subxml,sc,doc,a,i,k) {
qin = urlendecode(entity,"decode")
# XML header
out = "" "\n"
out = out "" "\n"
out = out "" "\n"
out = out "0" "\n"
out = out "89" "\n"
out = out "" "\n"
out = out "" "\n"
out = out "xml" "\n"
out = out "" rows "" "\n"
out = out "" qin "" "\n"
out = out "collection,creator,date,description,downloads,identifier,mediatype,publisher,title,year" "\n"
out = out "0" "\n"
out = out "" "\n"
out = out "" "\n"
out = out "" "\n"
# Get CSV version (only content type that works)
tail = "&fl[]=identifier&rows=" rows "&output=csv&callback=callback&save=yes&page=" pagenum
url = head entity tail
csv = http2var2(url)
# From here on tail requests the full field list as XML, one row at a time
tail = "&fl[]=date&fl[]=publisher&fl[]=collection&fl[]=creator&fl[]=description&fl[]=downloads&fl[]=identifier&fl[]=mediatype&fl[]=subject&fl[]=title&sort[]=" sort "&sort[]=&sort[]=&rows=1&output=xml&callback=callback&save=yes&page=" pagenum
c = split(csv, a, "\n")
# One request per CSV line after the header
while(i++ < c) {
if(i == 1) continue # Skip CSV header
gsub(/"/,"",a[i]) # Remove quotes around CSV field
entity = "identifier%3A%22" a[i] "%22"
url = head entity tail
subxml = http2var2(url)
# NOTE(review): as shown each of these replaces a character with itself - presumably entity decoding; confirm
gsub(/</,"<",subxml);gsub(/>/,">",subxml);gsub(/"/,"\"",subxml);gsub(/&/,"\\&",subxml)
sc = split(subxml, doc, "(|)")
# Only the second piece is kept, and only when it is longer than 10 characters
if(length(doc[2]) > 10) {
out = out "" "\n"
out = out doc[2]
out = out "" "\n"
}
}
out = out "" "\n"
out = out "" "\n"
# Replace the placeholder count with the number of CSV lines minus the header
gsub(/numFound="15"/,"numFound=\"" c - 1 "\"",out)
return out
}
#
# Search Wikipedia article for book title. Return true if match found.
# False positives may arise here if the book title is short or simple phrase
#
# booktitle - title as delivered by Internet Archive; it is reduced with extracttitle()
#             before being searched for in the global WPArticle.
#
# Returns 1 when agrep finds the reduced title in the article (25% error rate), else 0.
# A title that reduces to the empty string never matches: an empty search pattern
# must not be handed to agrep, where it could match any article.
#
function searchwikipedia(booktitle,    needle) {
    needle = extracttitle(booktitle)
    if (needle == "")
        return 0
    if( agrep( WPArticle, needle, ".25") )
        return 1
    return 0
}
#
# Strip a book title (from IA) down to its bare essential (rm subtitle etc) so that it can be searched for in the WP article.
#
# tit - title text
# Remaining parameters are local work variables.
#
# Steps, in order:
#   1. decode the XML entities &gt; &quot; &amp; and turn ";" into ":"
#   2. keep only the text before the first "—", " - ", "(" or ":"
#   3. drop "in two/three/four parts..." tails, expand "Mlle." to "Mademoiselle"
#   4. cut after the first word containing "." that is not an allowed abbreviation or initial
#   5. cut after the first word containing ",", unless it is the first word
#   6. drop a final "by", or a second-to-last "by" together with the word after it
# Returns the reduced title with surrounding whitespace removed.
#
function extracttitle(tit, c,l,f,h,k,o,p) {
    delete p

    # Convert any XML codes, or ";"
    # (entities are decoded before ";" is rewritten, otherwise their own ";" would split the title)
    gsub(/&gt;/,">",tit)
    gsub(/&quot;/,"\"",tit)
    gsub(/&amp;/,"\\&",tit)
    gsub(";",":",tit)

    # Rm all the words which follow one of these characters
    split(tit, h, "(—| - |[(]|:)")
    tit = h[1]

    # Rm special cases
    gsub(", in [Ff]our [Pp]arts.*","",tit)
    gsub("in [Ff]our [Pp]arts.*","",tit)
    gsub(", in [Tt]hree [Pp]arts.*","",tit)
    gsub("in [Tt]hree [Pp]arts.*","",tit)
    gsub(", in [Tt]wo [Pp]arts.*","",tit)
    gsub("in [Tt]wo [Pp]arts.*","",tit)
    gsub(/[Mm]lle[.]/, "Mademoiselle",tit)

    # Rm words following "." except in certain cases
    f = 0
    k = split(tit,o," ")
    for (l = 1; l <= k; l++) {
        p[++f] = o[l]
        # Allowed words (abbreviations, initials) pass through; the first other "word."
        # is kept without its final character and ends the title.
        if (o[l] ~ /[.]/ && o[l] !~ /Mrs[.]|Mr[.]|[A-Z][.]|Inc[.]|Dr[.]|Capt[.]|doma[.]|St[.]|No[.]/) {
            # substr() must start at 1: a start of 0 would shorten the result by one more character
            p[f] = substr(p[f], 1, length(p[f]) - 1) # Rm trailing "."
            break
        }
    }
    tit = join(p, 1, length(p), " ")

    # Rm words following "," except if first word (eg. "Sidonia, the Sorceress")
    delete p
    f = 0
    k = split(tit,o," ")
    for (l = 1; l <= k; l++) {
        p[++f] = o[l]
        if (o[l] ~ /[,]/ && f > 1) {
            p[f] = substr(p[f], 1, length(p[f]) - 1) # Rm trailing ","
            break
        }
    }
    tit = join(p, 1, length(p), " ")

    # Rm last word if "by" (whole word only, so that "Derby" or "Baby" are left alone)
    delete p
    f = 0
    c = split(tit, o, " ")
    if(strip(o[c]) ~ "^[Bb]y$") {
        while(f < c - 1) {
            f++
            p[f] = o[f]
        }
        tit = join(p, 1, length(p), " ")
    }

    # Rm second to last word if "by" (and the word that follows it)
    delete p
    f = 0
    c = split(tit, o, " ")
    if(strip(o[c - 1]) ~ "^[Bb]y$") {
        while(f < c - 2) {
            f++
            p[f] = o[f]
        }
        tit = join(p, 1, length(p), " ")
    }

    return strip(tit)
}
#
# Build a regular-expression alternation covering a year and the three years
# on either side of it, eg. 1860 gives "1857|1858|1859|1860|1861|1862|1863".
#
# Without a year, a string that will never occur in the data is returned, so that
# a pattern built from it makes no match.
#
function daterange(givendate, re,    delta) {
    if (!givendate)
        return "a1b2c3d4e5f6g7h8i9j0" # nonsense string if no date ie. no match will be made

    re = ""
    for (delta = -3; delta <= 3; delta++) {
        if (delta > -3)
            re = re "|"
        # The given year itself is emitted exactly as it was passed in
        re = re (delta == 0 ? givendate : givendate + delta)
    }
    return re
}
# _________________________ utilities _______________________________________________________
# See library.awk for others
#
#
# http2var2 - fetch a URL with wget and hand back the body as a string.
#
# Runs: <wget> --user-agent="<Agent>" -q -O- "<url>"
# The command output is passed through clean() before it is returned.
#
function http2var2(url,    fetchcmd)
{
    fetchcmd = Exe["wget"] " --user-agent=\"" Agent "\" -q -O- \"" url "\""
    return clean( sys2var( fetchcmd ) )
}
#
# Remove certain trailing characters from a string
#
# str - text to tidy
#
# Removes, in this order: surrounding whitespace, one trailing "()", and one trailing
# "," or ";". Only characters at the very end of the string are touched.
# Returns the tidied string.
#
function rmtrail(str) {
    str = strip(str)
    if( str ~ /[(][)]$/ ) # remove trailing ()
        str = strip( substr( str, 1, length(str) - 2 ) )
    if( str ~ /[,;]$/ ) # remove trailing , or ;
        str = substr( str, 1, length(str) - 1 )
    return strip(str)
}
#----------------------------------------------------
# Approximate (fuzzy) matching using agrep
#
# source = source text
# search = text to search for in source
# percent = maximum error rate percentage of search.
# ie. if search is 12 characters and max error rate is 25%, set to ".25"
# and it will return a match if up to 3 characters are wrong.
# debug = if "1", print debug statement.
# stype = search string is "regex" or "plain" text. Or "exact" for exact match (case-insensitive)
# Defaults to "plain" when empty; any other value returns 0.
# rlength = optional. Length of string without regex characters. Use if using "regex".
#
# Error rate is hard coded: max out at "6" on the upper and "1" on the lower.
# Agrep set to case-insensitive
#
# Return 0 if no match, otherwise number of matches
#
#----------------------------------------------------
function agrep(source, search, percent, debug, stype, rlength, slength,errorlimit,results,s,command)
{
    if(stype == "")
        stype = "plain"

    # For "regex" searches the caller-supplied rlength (pattern length without regex
    # syntax) drives the error limit instead of the raw pattern length
    slength = length(search)
    if(rlength == "" || rlength == 0)
        rlength = slength
    if(stype == "regex")
        slength = rlength

    # Limit # of errors to 25% of length of str, or no more than 6, whichever is less
    if(slength > 24)
        errorlimit = 6
    else
        errorlimit = int(slength * percent)
    # Floor: 1 error for strings shorter than 6 characters, otherwise 2
    if(errorlimit < 2) {
        if(slength < 6)
            errorlimit = 1
        else
            errorlimit = 2
    }

    gsub("\"","\\\"",search) # Escape any " marks
    # slength was measured before the escaping above, so a backslash at that position
    # means the original text ended in a " mark: cut the search text off just before it
    if( substr(search,slength,1) ~ /\\/ ) { # If last character is " ..
        s = substr(search, 1, slength - 1)
        search = s
    }

    if(stype == "regex")
        command = Exe["agrep"] " -i -c -" errorlimit " -- \"" agrepstrip( strip(search) ) "\""
    else if(stype == "exact")
        command = Exe["agrep"] " -i -k -c -0 -- \"" agrepstrip( strip(search) ) "\""
    else if(stype == "plain")
        command = Exe["agrep"] " -i -k -c -" errorlimit " -- \"" agrepstrip( strip(search) ) "\""
    else
        return 0

    if(debug)
        print "Agrep command = " command

    # Feed the source text to agrep through a co-process: write, close the write end
    # so agrep sees end-of-input, then read back the match count (-c)
    print agrepstrip(source) |& command
    close(command, "to")
    command |& getline results
    close(command)

    if(results > 0)
        return results
    else
        return 0
}
#
# Number of consecutive elements in an array that are indexed 1, 2, 3, ...
# Counting stops at the first missing index. Portable replacement for
# length(array) on older versions of gawk.
#
function len2(array, i) {
    for (i = 1; i in array; i++)
        ;
    return i - 1
}
#
# Return s with every "<...>" tag removed, where a tag is "<", one or more
# characters other than ">", then ">". (The method has general limitations but is OK for this app.)
#
function striphtml(s) {
    return gensub(/<[^>][^>]*>/, "", "g", s)
}
#
# Return str with every backtick removed, so that the text can be placed
# inside the agrep shell command without trouble.
#
function agrepstrip(str) {
    gsub(/[`]/, "", str)
    return str
}
#
# Count how many of the whitespace-separated words in str are equal to word.
#
function occurances(str, word, count,c,a,i) {
    c = split(str, a, " ")
    count = 0
    for (i = 1; i <= c; i++)
        count += (a[i] == word)
    return count
}
#
# Escape regex symbols
#
# str  - text to be used inside a regular expression
# safe - local work variable
#
# Returns a copy of str in which each regex metacharacter is wrapped in a bracket
# expression, and each "." is made optional.
#
function regesc2(str, safe) {
safe = str
# Wrap each of ] [ ^ $ * ? + { } \ ( ) | in brackets, eg. "*" becomes "[*]"
gsub(/[][^$*?+{}\\()|]/, "[&]", safe)
gsub("[\\^]","\\^",safe) # replace "[^]" with "[\^]"
gsub("[.]","[&]{0,1}",safe) # replace "." with "[.]{0,1}"
return safe
}