#!/usr/local/bin/awk -f
@include "init.awk"
@include "library.awk"
BEGIN {
# ################################################################################################
#
# Internet Archive Search (ias). Adapted from iacs (Internet Archive Classic Search)
#  Stephen Balbach October 2015
#
#  Required variable (the working directory; file names are appended to it
#  directly, so it is expected to end with "/"):
#
#       -v id="/home/adminuser/wi-awb/temp/wi-awb-1026211441/"
#       -v id="/home/adminuser/wi-awb/temp/wi-awb-1031200119/"
#
#  Optional variable:
#
#       -v cache="delete"
#
#    removes the iascache directory when done (ie. when running via wi.csh)
#
#  The purpose of this program is to search each book's metadata from a particular IA search
#  and determine if it belongs to the intended author. It will search the first 50 results
#  or whatever "Rows" is set to. It returns the number of books that it thinks match that author
#  in the file ias.result -- see talgo.awk for algorithmic processing to determine best search choice.
#
#   ____________________________________________________________________________________________
#
# Configuration variables:
#
#       MyProg   Program filename - the name of the script.
#       TDir     Working directory, taken from the "id" variable.
#       Rows     Number of results per page (50 was the IA default). Can be anything based on
#                your computer speed and memory etc..
#       Agent    URL Agent string - this is what Internet Archive sees in their logs. Recommend
#                setting the name of the program and your contact information, such as an email
#                or your Internet Archive userid.
#       MaxYear  Maximum year a book can be published. ie. if a book publication year is beyond
#                this then reject it as garbage data.
#
# ##################################################################################################

        TDir    = id
        MyProg  = "ias.awk"
        Rows    = 500
        Agent   = "IAS.AWK: Wikipedia<->IA Project (iacs2015@nym.hush.com)"
        MaxYear = 2030

        delete G
        main()

        # Remove cached files to save diskspace, but only when asked to.
        if( checkexists(TDir "iascache") ) {
          if( cache ~ /^delete$/ )
            sys2var( Exe["rm"] " -r -- " TDir "iascache")
        }
}

#
# Drive one run: load the candidate searches from ia-results, score each saved
# search with search_ia(), write ias.result and ias.scope, then run talgo to
# pick the best search and print the outcome.
#
# Reads:   TDir "article.txt", TDir "ia-results", TDir "tmpO@<name>@<type>@<n>"
# Writes:  TDir "ias.result", TDir "ias.scope", creates TDir "iascache"
# Globals: sets WPArticle, Result, Final, Scope and PROCINFO["sorted_in"]
#
function main(	s,c,a,b,k,kname,mname,fulln,ktype,m,n,o,d,count,debug,debugid,dest,i,oo,slug,target,nres,resultfile,scopefile) {

       debug = 0     # 1=on, 0=off
       debugid = "tmpO@Alan_Maxwell@t@2"   # Specified in this format

       WPArticle = clean( readfile(TDir "article.txt") )

       delete Result
       delete Final
       delete Scope

       # Each non-blank line of ia-results is "type|hits|name".
       s = readfile(TDir "ia-results") # Don't clean() - retain original
       c = split(s, a, "\n")
       for(i = 1; i <= c; i++) {
         if(a[i] == "")   # remove blank lines
           continue
         split(a[i], b, "|")
         if(b[2] != 0)    # remove 0 hit results
           k[b[3]][b[1]][b[2]] = 1 # name|type|hits
       }

       if(! checkexists(TDir "iascache") ) 
         sys2var( Exe["mkdir"] " " TDir "iascache")

       checkexists(TDir "iascache", "ias.awk main()", "exit") 

       for(kname in k) {
         m[kname] = kname       
       }

       for(mname in m) {

         # Look the name up literally. The name must never be used as a regex:
         # names containing ( ) [ ] . + etc. would fail to match themselves,
         # match other names, or abort on an invalid pattern.
         slug = gensub(" ","_","g",mname)
         for(ktype in k[mname]) {
           if(ktype == "none") 
             n[4] = "tmpO@" slug "@" ktype "@" 
           else if(ktype == "w") 
             n[3] = "tmpO@" slug "@" ktype "@" 
           else if(ktype == "t") 
             n[2] = "tmpO@" slug "@" ktype "@" 
           else if(ktype == "tx") 
             n[1] = "tmpO@" slug "@" ktype "@" 
         }
         fulln = mname

         # Visit n[1]..n[4] in numeric order (tx, t, w, none). This setting is
         # global and stays in force for every later for-in loop.
         PROCINFO["sorted_in"] = "@ind_num_asc"
         for(o in n) {

           # Find the first existing numbered file tmpO@name@type@1 .. @20
           c = 0
           while(c++ < 20) {
             if( checkexists(TDir n[o] c) ) 
               break
           }       
           target = n[o] c

           s = readfile(TDir target) # Don't clean() - retain original 
           print "________________________________________________________________________________________________________________________________"
           print target
           print "________________________________________________________________________________________________________________________________"
           split(gensub("[[]http://archive.org/search.php[?]query=","","g",s),a," ")
           d = split(mname,b," ")
           count = 0
           if(b[1] !~ /^Sir$/)  {                            # Skip names that begin with "Sir Whatever" as they produce odd results
               # In debug mode only the search named by debugid is run
               if(! debug || index(debugid, target) > 0)
                 count = search_ia(a[1],1,"",b[1],b[d],fulln,target)
           } else  {
               count = -1
               Result[target]["realcount"] = -1 
               Result[target]["numfound"] = -1
           }
           print " |1-----|"
           print count
           Result[target]["count"] = count
         }
         delete n
       }

       # Create ias.result (list of possibles), sorted by search id
       resultfile = TDir "ias.result"
       if(checkexists(resultfile)) {
         close(resultfile)
         sys2var( Exe["rm"] " " resultfile)
       }
       nres = asorti(Result, dest)
       if(nres) {
         for(o = 1; o <= nres; o++) 
           print dest[o] " | " Result[dest[o]]["count"] " | " Result[dest[o]]["realcount"] " | " Result[dest[o]]["numfound"] >> resultfile
         close(resultfile)
       }

       # Create ias.scope (list of metadata)
       scopefile = TDir "ias.scope"
       if(checkexists(scopefile))
         sys2var( Exe["rm"] " " scopefile)
       for(o in Scope) {
         for(oo in Scope[o])
           print o "|" oo "|" Scope[o][oo]["mediatype"] "|" Scope[o][oo]["date"] "|" Scope[o][oo]["jstor"] >> scopefile
       }
       close(scopefile)

       print " |2-----|"

       # Run talgo.awk algorithm to find best choice. Load from ias.result and save to ias.out - "-s" save to ia9out-scope
       if(nres) 
         sys2var( Exe["talgo"] " -s -l " resultfile)

       print strip( readfile(resultfile) ) 
       print "Final: " strip( readfile(TDir "ias.out") ) 

}

#
# Fill final[] with the scores recorded for search "name".
#
#   result  not read by this function; the values come from the global Result
#   final   array that receives "name", "count", "realcount" and "numfound"
#   name    key into the global Result array
#
function copyarray(result, final, name) {
            final["numfound"]  = Result[name]["numfound"]
            final["realcount"] = Result[name]["realcount"]
            final["count"]     = Result[name]["count"]
            final["name"]      = name
}

#
# Search Internet Archive and score the results.
#
#   entity    query string appended to the advancedsearch URL as-is
#   pagenum   result page to request
#   sort      value for the first sort[] parameter (may be "")
#   firstn    author first name  (passed to init_searchtokens_full)
#   lastn     author last name   (passed to init_searchtokens_full)
#   fulln     author full name   (passed to find_searchtokens_full)
#   idstring  key under which Result[] and Scope[] entries are stored
#
# Returns the sum of the per-work scores (+1 / -1 each), -1 if nothing could
# be downloaded, or -1001 when the running total has dropped so low that the
# search is abandoned early.
#
# Side effects: sets Result[idstring]["realcount"] and ["numfound"], rebuilds
# TokensFull via init_searchtokens_full(), prints progress to stdout.
#
function search_ia(entity, pagenum, sort, firstn, lastn, fulln, idstring,     head,tail,url,xml,c,numfound,doc,arr,arrg,arrgh,qin,G,i,count,ad,ac,currcount,base,fields,nfields,fi,tagname) {

        head = "http://archive.org/advancedsearch.php?q="
        tail = "&fl[]=date&fl[]=publisher&fl[]=collection&fl[]=creator&fl[]=description&fl[]=downloads&fl[]=identifier&fl[]=mediatype&fl[]=subject&fl[]=title&sort[]=" sort "&sort[]=&sort[]=&rows=" Rows "&output=xml&callback=callback&save=yes&page=" pagenum
        url = head entity tail

       # Download XML from Internet Archive
        xml = http2var2(url) 
        if ( xml == "" ) {
          print "Error in function search_ia: Unable to retrieve data from Internet Archive." > "/dev/stderr"
          return -1
        }

        if(length(xml) < 100) {  # Bug in Internet Archive. 
          print "Warning: XML bug. Using build_xml()"
          xml = build_xml(head,tail,entity,Rows,sort,pagenum)
        }

       # Decode entities; &amp; goes last so "&amp;lt;" is not decoded twice
        gsub(/&lt;/,"<",xml);gsub(/&gt;/,">",xml);gsub(/&quot;/,"\"",xml);gsub(/&amp;/,"\\&",xml)

        Result[idstring]["realcount"] = 0

       # doc[1] is the response header; the works are in doc[2], doc[4], ...
        c = split(xml, doc, "(<doc>|</doc>)")

       # numFound="432"
        match(xml, "numFound=\"[0-9]+\"", arr)
        split(arr[0], arrg, "\"")
        numfound = arrg[2]
        Result[idstring]["numfound"] = numfound

       # name="qin">text<    (search string as reported by IA)
        split(doc[1], arr, "(name=\"qin\">)")
        split(arr[2], arrg, "</str>")
        qin = arrg[1]

        init_searchtokens_full(qin,firstn,lastn)

       # Number of works that can appear on this page; used by the early abort
        if(numfound <= Rows)
          base = numfound
        else
          base = Rows

        nfields = split("creator subject collection", fields, " ")

        count = 0
        for(i = 2; i <= Rows * 2 && i <= c; i += 2) {

            delete G

       # identifier (name="identifier">presidentgarfiel00hinsuoft<)
            match(doc[i], "name=\"identifier\">[^<]+<", arr)
            split(arr[0],arrg,"(<|>)")
            G["identifier"] = arrg[2]
            if ( arrg[2] == "" )
              continue # Skip if no id

       # creator and subject and collection (if multiple names they are surrounded by <arr .. </arr>, remove those and leave each name surrounded by <str></str>)
       # Each line is claimed by the first field it mentions, <arr> form before <str> form.

            ac = split(doc[i], arr, "\n") # possible point of failure if XML format ever stopped having a \n after each line
            for(ad = 1; ad <= ac; ad++) {
              for(fi = 1; fi <= nfields; fi++) {
                if( index(arr[ad], "<arr name=\"" fields[fi] "\">") )
                  tagname = "arr"
                else if( index(arr[ad], "<str name=\"" fields[fi] "\">") )
                  tagname = "str"
                else
                  continue
                gsub("<" tagname " name=\"" fields[fi] "\">","",arr[ad])
                gsub("</" tagname ">","",arr[ad])
                G[fields[fi]] = strip(arr[ad])
                break
              }
            }

       # description (name="description">text<)
            split(doc[i], arr, "(<str name=\"description\")")
            split(arr[2], arrg, "</str>")
            G["description"] = striphtml(substr(arrg[1], 2))

       # date (name="date">1881-01-01T00:00:00Z) - keep the year only
            match(doc[i], "name=\"date\">[^<]+<", arr)
            split(arr[0],arrg,"(<|>)")
            split(arrg[2],arrgh,"-")
            G["date"] = strip(arrgh[1])

       # mediatype (name="mediatype">texts<)
            match(doc[i], "name=\"mediatype\">[^<]+<", arr)
            split(arr[0],arrg,"(<|>)")
            G["mediatype"] = arrg[2]

       # title (name="title">Title<)
            match(doc[i], "name=\"title\">[^<]+<", arr)
            split(arr[0],arrg,"(<|>)")
            G["title"] = arrg[2]

            currcount = find_searchtokens_full(G, fulln, idstring)
            count = count + currcount
            if(currcount > 0) {
              Result[idstring]["realcount"]++
            }

            if(count < ( ( (base / 2) * -1) -1) )  # Abort early - no chance
              return -1001                         

        }
        return count
}

#
# Score one work (its metadata in G[]) against the author tokens in the global
# TokensFull[] (built by init_searchtokens_full).
#
#   G         metadata of the work, keyed by field name ("identifier", "title",
#             "creator", "subject", "collection", "description", "date",
#             "mediatype"). Field values are rewritten in place ("-" -> " ").
#   fulln     author full name; only its length is used here
#   idstring  key of the current search, used to index Scope[]
#
# Returns 1 when the work's final score is above zero, otherwise -1. Works in
# the opensource_audio collection, of mediatype "software", or whose identifier
# contains "arxiv" return -1 as soon as that field is reached in the loop.
#
# Side effects: writes Scope[idstring][identifier], prints match lines to
# stdout, may update G["date"] through datehit().
#
# NOTE(review): tokenflag, agrep_debug, agrep_type, agrep_weight, caindex and
# re2 are not in the local list below, so they are global variables.
#
function find_searchtokens_full(G, fulln, idstring,	token,subentity,count,cc,ci,ca,da,dc,di,dm,re,stopflag,debug,debugid,debugo) {

       # Enable debug to examine how it scores individual work IDs. Some output saved to file "test" (overwritten with each run)
        debug = 0 # 1=on, 0=off
        debugid = "landscapingfordu00giro"
        debugo = "/home/pepper/wi-awb/debugo"
        if(G["identifier"] ~ debugid && debug) print "Starting" > debugo

        dc = split(TokensFull["_date"], da, " ")
        count = 0

        if( searchwikipedia(G["title"]) ) {               # If the book title is in the Wikipedia article..
          count++
          if(G["identifier"] ~ debugid && debug) {
            print "    Matched:"
            print "+1 in special 1-2 for " subentity >> debugo   # subentity has not been set yet at this point
          }
          print "  " G["identifier"] " | title | (wikipedia article) | = " G["title"]
        }

        if( length(G["identifier"]) < 1) G["identifier"] = "unknown"

        # Scope is recorded for every work, including ones rejected below
        Scope[idstring][G["identifier"]]["mediatype"] = G["mediatype"]
        Scope[idstring][G["identifier"]]["date"] = 0  # 1 if work is matched on a date
        Scope[idstring][G["identifier"]]["jstor"] = 0  # 1 if work is matched on creator and work is JSTOR

	for(subentity in G) {

          if(G["identifier"] ~ debugid && debug) 
            print "--Starting subentity " subentity

          if(subentity ~ /^collection$/ ) {               # Skip collection:opensource_audio works
            if(G[subentity] ~ /opensource_audio/) {
              return -1
            }
          }
          if(subentity ~ /^mediatype$/ ) {                # Skip mediatype:software works
            if(G[subentity] == "software") {
              return -1
            }
          }
          if(subentity ~ /^identifier$/ ) {               # Skip arxiv works
            if(G[subentity] ~ /arxiv/) {
              return -1
            }
            continue                                      # Skip "identifier" subentity
          }

         # Remove all "-" (same thing done in init_searchtokens())
          G[subentity] = strip(gensub("-"," ","g",G[subentity]))

          stopflag = 0                                    # debug only: print the field once per subentity
          tokenflag = 0                                   # set to 1 once a name token has matched in this field

	  for(token in TokensFull) {

            if(TokensFull[token] == "") continue

           # Skip the name tokens if 1 has already matched to avoid repeat matches due to regex+agrep
            if(tokenflag == 1 && substr(token,1,1) !~ /_/ ) continue  

            # These helper tokens are never scored on their own; they are used inside the checks below
            if(token ~ /^_(lastname|firstname|lastname_special)$/) {
              if(G["identifier"] ~ debugid && debug) 
                print "Skipping " token " in " subentity >> debugo
              continue
            }

            agrep_debug = 0
            if(G["identifier"] ~ debugid && debug) 
              agrep_debug = 1

	    if(token == "_date" && subentity !~ /^creator$|^subject/) { # See below for creator/subject + date check

              # NOTE(review): ".+{1,3}" puts the interval on ".+" rather than "." - confirm whether ".{1,3}" was intended
              re = "(" daterange(TokensFull["_birth"]) ").+{1,3}(" daterange(TokensFull["_death"]) ")" 
              if( match(G[subentity], re) && agrep(G[subentity], TokensFull["_lastname"], ".25", agrep_debug, "plain") ) {
                count++
                if(G["identifier"] ~ debugid && debug) {
                  print "    Matched:"
                  print "+1 in _date: G[" subentity "]=|" G[subentity] "| TokensFull[" token "]=|" TokensFull[token] "| lastname=|" TokensFull["_lastname"] "|" >> debugo
                }
                print "      " G["identifier"] " | " subentity " |" TokensFull[token] "| = " G[subentity] " (a)"

                Scope[idstring][G["identifier"]]["date"] = 1  # 1 if work is matched on a date

                continue
              }
            } 

            # Special first+last name pair: both must be found in the field
            if(token == "_firstname_special") {
              if( agrep(G[subentity], TokensFull["_firstname_special"], ".25", agrep_debug, "plain") && agrep(G[subentity], TokensFull["_lastname_special"], ".25", agrep_debug, "plain") ) {
                count++
                if(G["identifier"] ~ debugid && debug) {
                  print "    Matched:"
                  print "+1 in special for " subentity >> debugo
                }
                print "      " G["identifier"] " | " subentity " |" TokensFull["_firstname_special"] " " TokensFull["_lastname_special"] "| = " G[subentity] " (b)"
                continue
              }              
            }            

            # If creator or subject string contains a date, and it doesn't match one of the birth-death dates in TokensFull, then don't make a match.
            if(subentity ~ /^creator$|^subject/ && token != "_birth" && token != "_death") {
 
              if(G["identifier"] ~ debugid && debug && stopflag == 0) {
                print " " subentity ": " G[subentity]
                stopflag = 1
#for(z in TokensFull)
#print z " = " TokensFull[z]
#print "subentity = " subentity
#print "token = " token
              }

              cc = split(G[subentity], ca, /<str>|<\/str>/)         # when multi-names are separated by /<str><\/str>

              # With <str> wrappers the text before the first <str> lands in ca[1], so the first name is ca[2]
              G[subentity] ~ /<str>/ ? caindex = 2 : caindex = 1

if(G["identifier"] ~ debugid && debug) 
print "caindex(1) = " ca[caindex]

              if(ca[caindex] ~ /;/) {                               # when multi-names are separated by ;   ..this may break in certain cases
                  cc = split(ca[caindex], ca, /;/)
              } else if(ca[caindex] ~ /[ ]and[ ]/) {                # when multi-names are separated by / and / eg. archive.org/details/UsgsBulletin507MiningDistrictsOfTheWesternUnitedStates
                  cc = split(ca[caindex], ca, /[ ]and[ ]/)
              } else if(G["description"] ~ /by user tpb/ && ca[caindex] !~ /[0-9]{4}/ && G[subentity] !~ /<str>/ && length(ca[caindex]) > length(fulln) + 15 ) {    # when multi-names sep by , ("uploaded by user tpb")  
                  cc = split(ca[caindex], ca, /,/)
              }
#if(G["identifier"] ~ debugid && debug) 
#print "caci(-2) = " ca[1]

              # Check each individual name in the field against the current token
              ci = 0
              while(ci++ < cc) {

                if(ca[ci] == "") continue

                ca[ci] = strip(ca[ci])

#if(G["identifier"] ~ debugid && debug) 
#print "caci(-1) = " ca[ci]

                # Drop square brackets
                gsub(/[[]/,"",ca[ci])
                gsub(/[]]/,"",ca[ci])

#if(G["identifier"] ~ debugid && debug) 
#print "caci(0) = " ca[ci]

                re = "[.|,|;|-]{0,1}[ ]{0,1}from old catalog.*$"                    # remove trailing "[from old catalog]" and variations
                if(ca[ci] ~ re)
                  ca[ci] = rmtrail( gensub(re, "", "g", ca[ci]) )

#if(G["identifier"] ~ debugid && debug) 
#print "caci(0.1) = " ca[ci]

                if(token != "_date") {                                              # leave date in if checking for date otherwise remove it

                  re = "[Bb][.][ ]{0,1}" "(" daterange(TokensFull["_birth"]) ")$"   # remove trailing "b. 1860" 
                  if(ca[ci] ~ re) {
                    ca[ci] = rmtrail( gensub(re, "", "g", ca[ci]) ) 
                  }
#if(G["identifier"] ~ debugid && debug) 
#print "caci(1) = " ca[ci]

                  re = "(" daterange(TokensFull["_birth"]) ").+{1,3}(" daterange(TokensFull["_death"]) ")"
                  ca[ci] = rmtrail( gensub( re, "", "g", ca[ci]) ) # remove date

#if(G["identifier"] ~ debugid && debug) 
#print "caci(2) = " ca[ci]

                  ca[ci] = rmtrail( gensub(/[(][^\)]*[)]$/, "", "g", ca[ci]) )      # remove final parenthesis content eg. Loudon, W. J. (William James)

#if(G["identifier"] ~ debugid && debug) 
#print "caci(3) = " ca[ci]

                  re = "(" daterange(TokensFull["_birth"]) ")[-]{0,1}$"             # remove trailing "1860-" ie. when no death date is given
                  if(ca[ci] ~ re) {
                    ca[ci] = rmtrail( gensub(re, "", "g", ca[ci]) )

#if(G["identifier"] ~ debugid && debug) {
#print "re = " re
#print "caci(4) = " ca[ci]
#}
                  }

                  ca[ci] = tolower(ca[ci])                                    

                  ca[ci] = rmtrail( gensub("sir$", "", "g", ca[ci]) )               # remove trailing "Sir"
                  ca[ci] = strip( gensub("^sir ", "", "g", ca[ci]) )                # remove leading "Sir "
                  ca[ci] = strip( gensub(/annotator|translator/, "", "g", ca[ci]) ) # remove various 

                }

                # Build the pattern for this token: whole-name match, or the date pattern
                if(token != "_date") {
                  re = "^" regesc2(tolower(TokensFull[token])) "$"                   # eg. "^william t[.] volk$"
                  gsub("-","",re)                                                   # Is this needed?
                }
                else {
                  # eg, "1860.+{1}1910" which is like 1860?1910 - will match "#-#", "# - #", "# #", etc.. also catch "b. ####$" and "####-$"
                  re = "(" daterange(TokensFull["_birth"]) ").+{1,3}(" daterange(TokensFull["_death"]) ")|([Bb][.][ ]{0,1}(" daterange(TokensFull["_birth"]) ")$)|((" daterange(TokensFull["_birth"]) ")[-]{0,1}$)"  
                }

                agrep_debug = 0
                if(G["identifier"] ~ debugid && debug) {
                  print "  Step 1 (" token "): " tolower(ca[ci]) " | " re
                  agrep_debug = 1
                }

                if(token != "_date" && agrep(ca[ci], re, ".25", agrep_debug, "regex", length(TokensFull[token]) ) )  {
                  dm = 1
                  if(subentity ~ /^creator$/ && tolower(G["identifier"]) ~ /jstor/)
                    Scope[idstring][G["identifier"]]["jstor"] = 1  # 1 if creator field matches and JSTOR
                }

                # Check for lastname: ^roy|[ ]roy|[(]roy
                re2 = "^" regesc2(tolower(TokensFull["_lastname"])) "|[ ]{1,}" regesc2(tolower(TokensFull["_lastname"])) "|[(]" regesc2(tolower(TokensFull["_lastname"])) 
                if(token == "_date" && match(ca[ci], re) && agrep(ca[ci], re2, ".25", agrep_debug, "regex", length(TokensFull["_lastname"]) ) ) { # match date + lastname  
                  dm = 1
                  Scope[idstring][G["identifier"]]["date"] = 1  # 1 if work is matched on a date
                }

                # dm is the "this name matched" flag; it is reset after scoring
                if(dm == 1) {

                  if(G["identifier"] ~ debugid && debug) {
                    print "    Matched:"
                    print "   +1 for " token " in " subentity >> debugo
                  }
                  print "      " G["identifier"] " | " subentity " |" TokensFull[token] "| = " ca[ci]  " (c)"

                  count++
                  dm = 0
                  tokenflag = 1
                }

              }
              continue
            }

            # All remaining fields (and the _birth/_death tokens): match the token against the whole field
            agrep_debug = 0
            if(G["identifier"] ~ debugid && debug) 
              agrep_debug = 1

            agrep_type = "regex"
            agrep_weight = "0.25"

            if(subentity ~ /^description$|^title$/) {
              if(token !~ /_date|_birth|_death/ ) {
                if(TokensFull[token] ~ /[.]/) {
                  # Tokens containing "." are matched as plain text with a stricter weight
                  re = tolower(TokensFull[token])
                  agrep_type = "plain"
                  agrep_weight = "0.10"
                }
                else 
                  re = "^" regesc2(tolower(TokensFull[token])) "|[ ]{1,}" regesc2(tolower(TokensFull[token])) "|[(]" regesc2(tolower(TokensFull[token])) 
              }
              else
                re = regesc2(tolower(TokensFull[token]))
            } 
            else {
              re = regesc2(tolower(TokensFull[token]))
            }

            # agrep is called first, so it also runs for _birth/_death even though those never score
            if( agrep(tolower(G[subentity]), re, agrep_weight, agrep_debug, agrep_type, length(TokensFull[token]) ) && token != "_birth" && token != "_death" ) {
              count++
              if(G["identifier"] ~ debugid && debug) {
                print "    Matched:"
                print "+1 for " token " (" TokensFull[token] ") in " subentity ": " G[subentity] >> debugo
              }
              print "      " G["identifier"] " | " subentity " |" TokensFull[token] "| = " G[subentity] " (d)"
              tokenflag = 1
            }
          }
        }

        if(datehit(G,TokensFull["_birth"],count) == 0) # This must be the last modifier to count
          count--
        if(G["identifier"] ~ debugid && debug) 
          close(debugo)
        print G["identifier"] " | " count
        if(count > 0) return 1
        return -1

}

#
# Return 1 if the work's year lies between the author's year of birth + 20 and
# year of death (inclusive), otherwise 0. Works that should not be penalized
# for a missing or late date also return 1.
#
#   G      metadata of the work; G["date"] may be filled in from scrapedate()
#   birth  author's year of birth
#   count  current score of the work
#
# The year range is taken from TokensFull["_birth"] and TokensFull["_death"].
#
function datehit(G,birth,count,    year) {

        # Don't penalize audio/video collections which don't have historic dates
        if(G["mediatype"] !~ "texts") 
          return 1

        # Don't penalize Project Gutenberg which have no dates
        if(tolower(G["description"]) ~ /project gutenberg/)
          return 1

        # Don't penalize old authors since most of their books on IA will be later-period reprints.
        # "+ 0" forces a numeric comparison; birth normally arrives as a string.
        if(birth + 0 < 1830 )
          return 1

        if(G["date"] == "" && count == 1) {   # Scrape date only if count is 1 - anything more/less won't matter. This should speed up processing. 
          G["date"] = scrapedate(G["identifier"])    
        }

        print "\nDATE = " G["date"]

        if(G["date"] == "")
          return 0

        # Compare as numbers. Comparing the string values directly is a string
        # comparison in awk, which lets malformed years such as "188u" pass.
        year = G["date"] + 0
        if(year >= TokensFull["_birth"] + 20 && year <= TokensFull["_death"] + 0)
          return 1

        return 0

}

#
# Rebuild the global TokensFull[] from the search string reported by IA.
#
#   qin     search string as reported by Internet Archive
#   firstn  author first name -> TokensFull["_firstname"]
#   lastn   author last name  -> TokensFull["_lastname"]
#
# Besides the two names, TokensFull receives:
#   _date, _birth, _death             from a quoted "YYYY-YYYY" segment
#   _firstname_special, _lastname_special
#                                     for the unusual query shapes listed below
#   one entry per accepted quoted segment, keyed by its own text
# Every "-" in a stored value is replaced by a space. The query and the final
# token table are printed to stdout.
#
function init_searchtokens_full(qin, firstn, lastn		,a,c,d,e,f,h,j,k,head,dm,safe,aoc,aoa,aoaa,aoi,andorstr,i,offset) {

        delete TokensFull

        TokensFull["_lastname"] = lastn
        TokensFull["_firstname"] = firstn

      # Populate _first/lastname_special when a unusual search string. See test.awk specialtest() for checking test cases.
      #  Note: these are specific and need to be modified if search string format (qin) ever changes ie. in iaa.lua

      # Reduce the query to its sequence of operators, eg. "AND OR AND"
        aoc = patsplit(qin, aoa, /AND|OR/)
        for(aoi = 1; aoi <= aoc; aoi++)
          andorstr = andorstr " " aoa[aoi]
        andorstr = strip(andorstr)

      # The longest shape must be tested first: it begins with "AND OR AND",
      # which the general test below would otherwise claim.
        if(andorstr ~ /AND OR AND OR OR AND/) {
          split(qin,aoa,/AND|OR/)
          gsub(/[)]|[(]|["]/,"",aoa[2])
          gsub(/[)]|[(]|["]/,"",aoa[4])
          TokensFull["_firstname_special"] = strip(aoa[2])   # first quoted first name
          TokensFull["_lastname_special"] = strip(aoa[4])    # first quoted last name
        }
        else if(andorstr ~ /AND OR OR AND OR|AND OR AND|AND OR OR AND|AND OR AND OR/ ) {
          split(qin,aoa,/AND|OR/)
          gsub(/[)]|[(]|["]/,"",aoa[2])
          aoi = split(aoa[2],aoaa," ")
          TokensFull["_firstname_special"] = strip(aoaa[1])  # first word of the full name
          TokensFull["_lastname_special"] = strip(aoaa[aoi]) # last word of the full name
        }

        # AND OR AND OR OR AND
        # First char extended ascii special search 
        # example: (-mediatype:software) AND ((("Étienne" OR "Etienne") AND ("Aignan" OR "Aignan")) OR ("1773-1824" AND "Aignan"))

        # AND OR OR AND OR
        # 5+word extended ascii 
        # example: (-mediatype:software) AND (((Claude Prosper Jolyot de Crébillon) OR (Claude Prosper Jolyot de Cr*billon)) OR ("1707-1777" AND ("Crébillon" OR "Crebillon")))

        # AND OR OR AND
        # 5+word extended ascii 
        # example: (-mediatype:software) AND ((François Christophe Edmond de Kellermann OR (Fran*ois Christophe Edmond de Kellermann)) OR ("1802-1868" AND "Kellermann"))

        # AND OR AND OR
        # 5+word extended ascii 
        # example: (-mediatype:software) AND ((Claude Prosper Jolyot de Crébillon) OR ("1707-1777" AND ("Crébillon" OR "Crebillon")))
        
        # AND OR AND
        # 5+word extended ascii
        # example: (-mediatype:software) AND ((François Christophe Edmond de Kellermann) OR ("1802-1868" AND "Kellermann"))

      # Walk the pieces of qin separated by double quotes. When qin does not
      # start with a quote, offset is 1 and only every second piece is visited.
        c = split(qin, a, "\"")
        offset = 0 
        if(length(a[1]) > 0) 
          offset = 1
        i = 0 + offset
        while(i++ < c) {
          if(a[i] ~ /^[0-9]{4}-[0-9]{4}$/) {
            TokensFull["_date"] = a[i]
            i = i + offset
            split(TokensFull["_date"], h, "-")
            TokensFull["_birth"] = h[1]
            TokensFull["_death"] = h[2]
            continue
          }

                  # Filter out 1-word names (unless name really is 1 word)
          if(firstn !~ lastn) {
            d = split(a[i],e," ")
            if(d == 1) {
              i = i + offset
              continue
            }
          }

                  # Filter out when last character is "." eg. "Jéquier, G."
          if( substr(a[i], length(a[i]), 1) ~ /[.]/ ) {
            i = i + offset
            continue
          }
            
                  # Filter out duplicates caused by extended ascii eg. Gustave Jéquier / Gustave Jequier  (note the é/e)
                  #   because agrep will handle these cases in find_searchtokens(). This will break badly if search syntax ever changes.
                  # Only a piece whose preceding piece ends in ":" is kept.
          if(a[i - 1] ~ /[:]$/)
            TokensFull[a[i]] = a[i]

          i = i + offset
        }

       # Remove all "-" (same thing done in find_searchtokens())
        for(head in TokensFull)
          TokensFull[head] = strip(gensub("-"," ","g",TokensFull[head]))

print qin
print " |5-----|"
for(head in TokensFull)
  print head " = " TokensFull[head]
print " |6-----|"

	  
}

#
# Highlight search tokens.
#
# Splits str on whitespace and wraps every word that equals (ignoring case and
# the punctuation listed below) a key of the global Tokens[] in
# <span class="high">..</span>. Words are re-joined with single spaces.
#
function hightokens(str		,c,arr,i,t,out,build,work) {

        c = split(str, arr, " ")
        for(i = 1; i <= c; i++) {

          # Compare on a copy with quotes, trailing dots, possessives etc. removed
          work = arr[i]
          gsub("([,]|\"|[.]$|\"[.]$|'[.]$|`[.]$|;|[:]|^'|'$|^`|`$|'s$|'s[.]$)","",work)               # <-- Customize search token exceptions
          work = tolower(work)

          out = arr[i]                                   # default: word unchanged
          for( t in Tokens ) {
            if( tolower(t) == work ) {
              out = "<span class=\"high\">" arr[i] "</span>"
              break
            }
          }

          if(i == 1)
            build = out
          else
            build = build " " out
        }
        return build
}
#
# Rebuild the global Tokens[] (used by hightokens) from an IA query string.
#
#   iaquery  query with words separated by "+" and/or spaces
#
# Double quotes are removed and "+" becomes a space. Each remaining word is
# stored as Tokens[word] = word after stripping trailing commas and
# surrounding parentheses, except:
#   - the logic operators AND / OR (optionally wrapped in parentheses)
#   - any word containing ":" (field qualifiers such as mediatype:texts)
#   - words that are empty once stripped
#
function init_searchtokens(iaquery	,arr, arrg, i, k, c) {

        delete Tokens
        gsub("\"","",iaquery)
        gsub(/[+]/," ",iaquery)        # bracketed: a bare "+" is not a portable regex
        c = split(iaquery, arrg, " ")
        for(i = 1; i <= c; i++) {

          # <-- Customize tokens to ignore eg. logic statements, etc..
          # The operators are matched as whole words so that names such as
          # "GORDON" or "ANDREW" are not discarded.
          if( arrg[i] ~ /:/ || arrg[i] ~ /^[(]*(AND|OR)[)]*$/ )
            continue

          gsub(/[,]+$/,"",arrg[i])
          gsub(/[)]+$/,"",arrg[i])
          gsub(/^[(]+/,"",arrg[i])

          # An empty token would make hightokens() highlight bare punctuation
          if( arrg[i] != "" )
            Tokens[arrg[i]] = arrg[i]
        }
}

#
# If the date of a work is empty, it may be due to a bug at Internet Archive: when the date field contains non-ascii characters, it returns an empty date.
#  See examples paramountmethodf00fode ("c1922") and greeceallies191400abbouoft ("[1922]").
# This function scrapes the main work page searching for "stated date is 1922" in the copyright-notice or, failing that, looks at the meta.xml file.
#
function scrapedate(wid,      page,a,b,c,d,e,metaxml,debug,debugid,debugo,widu,fname) {

       # Enable debug to examine how it handles individual work IDs. 
        debug = 0 # 1=on, 0=off
        debugid = "visionbowditchhp00bowdrich"
        debugo = "/home/adminuser/wi-awb/testpage"

        widu = gensub(/ /,"_","g",wid) # Convert " " to "_". Should never happen but in case.

        # Read from local cache, if not exist load from web and save to cache
        fname = TDir "iascache/" widu
        if( checkexists( fname ) ) {
          page = readfile( fname )
        }
        else {
          page =  http2var2("http://archive.org/details/" widu)
          print page > fname 
          close( fname )
        }

        if( match(page, "stated date is [0-9]{4}", a) ) {  # If it has a "stated date is XXXX" on the main work page, use that.
          if(wid ~ debugid && debug)
            print "  Step 0: " a[0]
          return strip( substr(a[0], length(a[0]) - 4, length(a[0]) ) )
        }

        if(wid ~ debugid && debug) {
          print page > debugo
          close(debugo)
        }
                                                         # Otherwise, get the meta.xml file and see if it has a <date></date> pair

      
        # Read from local cache, if not exist load from web and save to cache
        fname = TDir "iascache/" widu "_meta.xml"
        if( checkexists( fname ) ) {
          metaxml = readfile( fname )
        }
        else {
          metaxml = http2var2("http://archive.org/download/" widu "/" widu "_meta.xml")
          print metaxml > fname
          close( fname )
        }

        match(metaxml, /[<]collection[>][^<]+[<][/]collection[>]/, e)

        if(tolower(e[0]) ~ /citebank/) {                 # Special case for <collection>citebank</collection> 
          if( match(metaxml, /[<]volume[>][^<]+[<][/]volume[>]/, f) ) {
            if(wid ~ debugid && debug)
              print "  Step 2: " f[0]
            if( match(f[0], /[0-9]{4}/, d) ) {
              return strip(d[0])
            }
          } else {
            if( match(metaxml, /[<]year[>][^<]+[<][/]year[>]/, c) ) {
              if(wid ~ debugid && debug)
                print "  Step 2: " c[0]
              if( match(c[0], /[0-9]{4}/, d) ) {
                return strip(d[0])
              }
            }
          }
        }

        if( match(metaxml, /[<]date[>][^<]+[<][/]date[>]/, c) ) {  # Check for <date></date>
          if(wid ~ debugid && debug)
            print "  Step 2: " c[0]
          if( match(c[0], /[0-9]{4}/, d) ) {
            return strip(d[0])
          }
        }

                                                         # If still nothing, check for <year></year>

        if( match(metaxml, /[<]year[>][^<]+[<][/]year[>]/, c) ) {
          if(wid ~ debugid && debug)
            print "  Step 2: " c[0]
          if( match(c[0], /[0-9]{4}/, d) ) {
            return strip(d[0])
          }
        }

                                                         # If still nothing, check if a date is in publisher field eg. see imperialgazette00meyegoog
        if( match(metaxml, /[<]publisher[>][^<]+[<][/]publisher[>]/, c) ) {
          if(wid ~ debugid && debug)
            print "  Step 2: " c[0]
          if( match(c[0], /[0-9]{4}/, d) ) {
            if(strip(d[0]) > 1200 && strip(d[0]) < MaxYear ) {
              return strip(d[0])
            }
          }
        }

        if(wid ~ debugid && debug)
          print "  Step 3: none found"

        return ""
        
}

#
# Build a passable XML file due to bug in Internet Archive (in some cases) triggered by unknown reasons.
#  1. create a fake XML header 
#  2. download list of identifier's in CSV format (this works for some reason but XML doesn't)
#  3. for each identifier, request *individual* XML (one for each work) and extract the portion between <doc></doc>
#  4. build a complete XML from the parts  
#
#  head    = leading part of the search URL
#  tail    = ignored on entry; rebuilt internally
#  entity  = URL-encoded query
#  rows    = number of identifiers to request
#  sort    = value for the first sort[] parameter of the per-work requests
#  pagenum = result page number
#
#  Returns the assembled XML document as a string. numFound is the number of
#  identifiers listed in the CSV; blank lines are skipped, so no request is made
#  for an empty identifier and an empty CSV reports 0 rather than -1.
#
#  NOTE: qin and url are assigned without being declared local, exactly as before,
#  because other code may read them as globals. The CSV line counter c is now local.
#
function build_xml(head,tail,entity,rows,sort,pagenum, 		csv,out,subxml,sc,doc,a,i,k,c,docs,found,ident) {

       qin = urlendecode(entity,"decode")

     # Get CSV version (only content type that works)

       tail = "&fl[]=identifier&rows=" rows "&output=csv&callback=callback&save=yes&page=" pagenum
       url = head entity tail
       csv = http2var2(url)

     # Every following request fetches exactly one work (rows=1) as XML

       tail = "&fl[]=date&fl[]=publisher&fl[]=collection&fl[]=creator&fl[]=description&fl[]=downloads&fl[]=identifier&fl[]=mediatype&fl[]=subject&fl[]=title&sort[]=" sort "&sort[]=&sort[]=&rows=1&output=xml&callback=callback&save=yes&page=" pagenum

       docs = ""
       found = 0
       c = split(csv, a, "\n")
       for (i = 2; i <= c; i++) {                    # a[1] is the CSV header
         ident = a[i]
         gsub(/"/,"",ident)                          # Remove quotes around CSV field
         gsub(/^[ \t\r]+|[ \t\r]+$/,"",ident)        # Remove stray whitespace / carriage return
         if (ident == "")                            # Blank line: nothing to look up
           continue
         found++

         entity = "identifier%3A%22" ident "%22"
         url = head entity tail
         subxml = http2var2(url)
         gsub(/&lt;/,"<",subxml);gsub(/&gt;/,">",subxml);gsub(/&quot;/,"\"",subxml);gsub(/&amp;/,"\\&",subxml)

         # doc[2] is the text between the first <doc> and </doc>
         split(subxml, doc, "(<doc>|</doc>)")
         if(length(doc[2]) > 10) {  
           docs = docs "<doc>" "\n"
           docs = docs doc[2]
           docs = docs "</doc>" "\n"
         }
       }

     # XML header
       out = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" "\n" 
       out = out "<response>" "\n"
       out = out "<lst name=\"responseHeader\">" "\n"
       out = out "<int name=\"status\">0</int>" "\n"
       out = out "<int name=\"QTime\">89</int>" "\n"
       out = out "<lst name=\"params\">" "\n"
       out = out "<str name=\"sort\"></str>" "\n"
       out = out "<str name=\"wt\">xml</str>" "\n"
       out = out "<str name=\"rows\">" rows "</str>" "\n"
       out = out "<str name=\"qin\">" qin "</str>" "\n"
       out = out "<str name=\"fl\">collection,creator,date,description,downloads,identifier,mediatype,publisher,title,year</str>" "\n"
       out = out "<str name=\"start\">0</str>" "\n"
       out = out "</lst>" "\n"
       out = out "</lst>" "\n"
       # numFound is written directly instead of patching a placeholder afterwards,
       # so text inside the documents can never be rewritten by accident
       out = out "<result name=\"response\" numFound=\"" found "\" start=\"0\">" "\n"

       out = out docs

       out = out "</result>" "\n"
       out = out "</response>" "\n"

       return out

}


#
# Search Wikipedia article for book title. Return true if match found.
#  The title is first reduced by extracttitle() and then fuzzy-matched (agrep,
#  error rate ".25") against the global WPArticle.
#  False positives may arise here if the book title is short or simple phrase
#
#  Returns 1 on a match, otherwise 0.
#

function searchwikipedia(booktitle) {

        return agrep( WPArticle, extracttitle(booktitle), ".25") ? 1 : 0
}


#
# Strip a book title (from IA) down to its bare essential (rm subtitle etc) so that it can be searched for in the WP article.
#
#  tit = book title as delivered by Internet Archive
#
#  Steps, in order:
#    1. decode &gt; &quot; &amp; and turn ";" into ":"
#    2. keep only the text before the first "—", " - ", "(" or ":"
#    3. drop "in two/three/four parts ..." and expand "Mlle."
#    4. cut the title after the first word containing "." that is not an allowed abbreviation
#    5. cut the title after the first word containing "," unless it is the first word
#    6. drop a trailing "by", or a "by <word>" ending
#
#  Returns the reduced title passed through strip().
#
#  Fixes over the previous version:
#    - the trailing "." / "," was removed with substr(x, 0, length(x) - 1); in gawk a
#      start of 0 shortens the result by one more character, so "Poems." became "Poem".
#      The index is now 1-based.
#    - the "by" tests were unanchored, so a title ending in "Derby" or "Rugby Football"
#      lost those words. Only the exact word "by" / "By" is matched now.
#
function extracttitle(tit,	c,l,f,h,k,o,p) {

        delete p

       # Convert any XML codes, or ";"
        gsub(/&gt;/,">",tit)
        gsub(/&quot;/,"\"",tit)
        gsub(/&amp;/,"\\&",tit)
        gsub(";",":",tit)

       # Rm all the words which follow one of these characters
        split(tit, h, "(—| - |[(]|:)")
        tit = h[1]

       # Rm special cases
        gsub(", in [Ff]our [Pp]arts.*","",tit)
        gsub("in [Ff]our [Pp]arts.*","",tit)
        gsub(", in [Tt]hree [Pp]arts.*","",tit)
        gsub("in [Tt]hree [Pp]arts.*","",tit)
        gsub(", in [Tt]wo [Pp]arts.*","",tit)
        gsub("in [Tt]wo [Pp]arts.*","",tit)
        gsub(/[Mm]lle[.]/, "Mademoiselle",tit)

       # Rm words following "." except in certain cases
        f = 0
        k = split(tit,o," ")
        for (l = 1; l <= k; l++) {
          f++
          p[f] = o[l]
          if(o[l] ~ /[.]/) {
            if(o[l] ~ /Mrs[.]|Mr[.]|[A-Z][.]|Inc[.]|Dr[.]|Capt[.]|doma[.]|St[.]|No[.]/)
              continue                                   # Bypass these allowed words
            # Keep first occurance of non-allowed "word." and drop the remaining words
            p[f] = substr(p[f], 1, length(p[f]) - 1)     # Rm trailing "."
            break
          }
        }
        tit = join(p, 1, length(p), " ")

       # Rm words following "," except if first word (eg. "Sidonia, the Sorceress")
        f = 0
        delete p
        k = split(tit,o," ")
        for (l = 1; l <= k; l++) {
          f++
          p[f] = o[l]
          if(o[l] ~ /[,]/ && f > 1) {                    # f == 1 means first word: keep it and carry on
            p[f] = substr(p[f], 1, length(p[f]) - 1)     # Rm trailing ","
            break
          }
        }
        tit = join(p, 1, length(p), " ")

       # Rm last word if "by"
        delete p
        c = split(tit, o, " ")
        if(c >= 1 && strip(o[c]) ~ /^[Bb]y$/) {
          for (f = 1; f <= c - 1; f++)
            p[f] = o[f]
          tit = join(p, 1, length(p), " ")
        }

       # Rm second to last word if "by" (together with the word after it)
        delete p
        c = split(tit, o, " ")
        if(c >= 2 && strip(o[c - 1]) ~ /^[Bb]y$/) {
          for (f = 1; f <= c - 2; f++)
            p[f] = o[f]
          tit = join(p, 1, length(p), " ")
        }

        return strip(tit)

}

#
# Build a RE with a date range 
#  Returns the alternation "Y-3|Y-2|Y-1|Y|Y+1|Y+2|Y+3" for the given year Y.
#  If no date is given a nonsense string is returned so that no match will be made.
#
function daterange(givendate,    re, delta, term) {

        if(!givendate)
          return "a1b2c3d4e5f6g7h8i9j0"  # nonsense string if no date ie. no match will be made

        for (delta = -3; delta <= 3; delta++) {
          # The year itself is inserted exactly as passed in; its neighbours are computed
          term = (delta == 0) ? givendate : givendate + delta
          re = (delta == -3) ? term "" : re "|" term
        }
        return re
}

# _________________________ utilities _______________________________________________________

# See library.awk for others

#
# http2var2 - replicate "wget -q -O- http://..." from within gawk
#   Runs wget (path from Exe["wget"]) with the global Agent as user-agent string and
#   returns the HTML page as a string, passed through clean().
#   NOTE(review): url is placed inside double quotes on a shell command line - callers
#   should only pass URLs that are safe in that context.
#
function http2var2(url,    cmd)
{
     cmd = Exe["wget"] " --user-agent=\"" Agent "\" -q -O- \"" url "\""
     return clean( sys2var(cmd) )
}

#
# Remove certain trailing characters from a string
#  After strip(), an empty "()" at the very end (plus any blanks before it) is
#  removed, then one "," or ";" at the very end. The result is strip()ed again.
#
#  The tests are anchored to the end of the string. The previous version looked for
#  the characters anywhere in the last 3 (resp. 2) characters and then chopped a fixed
#  number of characters, so "foo();" became "foo(" and "foo,a" became "foo,", while a
#  bare "()" or "," was left untouched.
#
function rmtrail(str) {

        str = strip(str)

        sub(/[ \t]*[(][)]$/, "", str)     # remove trailing ()
        sub(/[,;]$/, "", str)             # remove trailing , or ;

        return strip(str)
}

#----------------------------------------------------
# Approximate (fuzzy) matching using agrep
#
#  source  = source text
#  search  = text to search for in source
#  percent = maximum error rate percentage of search.
#            ie. if source is 12 characters and max error rate is 25%, set to ".25"
#                and it will return a match if up to 3 characters are wrong.
#  debug   = if "1", print debug statement.
#  stype   = search string is "regex" or "plain" text. Or "exact" for exact match (case-insensitive)
#            Defaults to "plain". Any other value returns 0.
#  rlength = optional. Length of string without regex characters. Use if using "regex".
#
#  Error rate is hard coded: max out at "6" on the upper and "1" on the lower.
#  Agrep set to case-insensitive
#
#  Return 0 if no match, otherwise number of matches
#
#  Fix: the "exact" branch used to test an undefined variable (regex) instead of
#  stype, so stype="exact" always fell through to "return 0".
#
#----------------------------------------------------
function agrep(source, search, percent, debug, stype, rlength,      slength,errorlimit,results,s,command)
{

 if(stype == "")
   stype = "plain"
 slength = length(search)
 if(rlength == "" || rlength == 0)
  rlength = slength
 if(stype == "regex")                            # For a regex only the non-regex characters count
   slength = rlength

 # Limit # of errors to percent of length of str, or no more than 6, whichever is less
  if(slength > 24)
    errorlimit = 6
  else
    errorlimit = int(slength * percent)
  if(errorlimit < 2) {                           # Floor: 1 error for very short strings, else 2
    if(slength < 6)
      errorlimit = 1
    else
      errorlimit = 2
  }

  gsub("\"","\\\"",search)                       # Escape any " marks
  if( substr(search,slength,1) ~ /\\/ ) {        # If a backslash sits at position slength, cut the string just before it
    s = substr(search, 1, slength - 1)           # NOTE(review): slength is the length measured before escaping - confirm intent
    search = s
  }

  if(stype == "regex")
    command = Exe["agrep"] " -i -c -" errorlimit  " -- \"" agrepstrip( strip(search) ) "\""
  else if(stype == "exact")
    command = Exe["agrep"] " -i -k -c -0 -- \"" agrepstrip( strip(search) ) "\""
  else if(stype == "plain")
    command = Exe["agrep"] " -i -k -c -" errorlimit  " -- \"" agrepstrip( strip(search) ) "\""
  else
    return 0

  if(debug)
    print "Agrep command = " command

  # Feed the source text to agrep through a coprocess and read back its match count (-c)
  print agrepstrip(source) |& command
  close(command, "to")
  command |& getline results
  close(command)

  if(results > 0)
    return results
  else
    return 0
}

#
# Length of an array. Portable function for older versions of gawk
#  Counts the consecutive integer indexes starting at 1; counting stops at the
#  first missing index.
#
function len2(array, n) {
        n = 0
        while ((n + 1) in array)
          n++
        return n
}

#
# strip HTML (method has general limitations but OK for this app)
#  Deletes every "<...>" sequence that has at least one character between the
#  angle brackets and returns the remaining text.
#
function striphtml(s) {
        return gensub(/<[^>][^>]*>/, "", "g", s)
}

#
# Remove problem shell characters when running agrep
#  Currently this deletes every backtick from str.
#
function agrepstrip(str) {

    gsub(/[`]/, "", str)
    return str

}

#
# Count number of occurances of word in str .. presumes space separated
#  Only whole space-separated fields equal to word are counted.
#
function occurances(str, word,   hits,n,parts,idx) {

    hits = 0
    n = split(str, parts, " ")
    for (idx = 1; idx <= n; idx++) {
      if (parts[idx] == word)
        hits++
    }
    return hits
}

#
# Escape regex symbols
#  Returns str rewritten so that it can be used as a dynamic regex matching str literally:
#    ] [ $ * ? + { } ( ) |   are each wrapped in a bracket expression, eg. "*" -> "[*]"
#    ^                       becomes "[\^]"
#    \                       becomes "[\\]"
#    .                       becomes "[.]{0,1}" ie. the period is optional
#
#  Fix: a backslash used to be emitted as "[\]". gawk treats the backslash inside a
#  bracket expression as an escape character, so that form is an unterminated bracket
#  expression. The string is now built character by character, which also avoids the
#  replacement-string escaping rules of gsub().
#
function regesc2(str,   safe,i,n,ch) {
  safe = ""
  n = length(str)
  for (i = 1; i <= n; i++) {
    ch = substr(str, i, 1)
    if (ch == "^")
      safe = safe "[\\^]"
    else if (ch == "\\")
      safe = safe "[\\\\]"
    else if (ch == ".")
      safe = safe "[.]{0,1}"
    else if (index("][$*?+{}()|", ch) > 0)
      safe = safe "[" ch "]"
    else
      safe = safe ch
  }
  return safe
}