#!/usr/local/bin/gawk -E

# -E required for getopt so doesn't conflict with Gawk command line options

#
# Algorithm to determine best match. Run "talgo -h" for help.
#
# Globals shared by the functions in this file:
#   Result[id][field]  candidate records; id looks like "tmpO@Gustave_Jéquier@w@3",
#                      field is one of name/count/realcount/numfound/common
#   Secondfinal[][]    records set aside by algo_version1() (last name differs from Givenname)
#   Scope[][][]        data read from ias.scope by main()
#   Givenname, Common, Stamp, Filen, Loadfrom[], Dlevel, SaveScope
#

@include "init.awk"
@include "library.awk"
#@include "getopt.awk"

BEGIN {

  delete Result
  delete Final
  delete Scope

  Dlevel = 0      # Debug level eg "test -d 2" .. higher number, more output.
  SaveScope = 0   # Save/create ia9out-scope file if 1, 0 don't save/create
  Stamp = ""      # temp directory ID stamp eg. wi-awb-0118124659
  Preloaded = 0   # 1 once -r / -o / -2..-6 has filled Result[] directly

  # Option letter -> directory holding archived wi.csh output files
  OldDir["o"] = "extasciiwildfix.old/"
  OldDir["2"] = "extasciiwildfix.old2/"
  OldDir["3"] = "extasciiwildfix.old3/"
  OldDir["4"] = "extasciiwildfix.old4/"
  OldDir["5"] = "extasciiwildfix.old5/"
  OldDir["6"] = "extasciiwildfix.old6/"

  Optind = Opterr = 1
  while ((C = getopt(ARGC, ARGV, "sd:p:f:h:l:r:o:2:3:4:5:6:")) != -1) {
    opts++

    if (C == "p")                   # -p Use project name. Defaults to what's in project.cfg
      pid = verifypid(Optarg)

    if (C == "l") {                 # -l Load from ias.awk output file eg. ias.result
      Loadfrom["type"] = "ias"
      Loadfrom["name"] = verifyval(Optarg)   # is expected to be the full path/filename
    }

    if (C == "f") {                 # -f "First Last" Load from wi.csh output file
      Loadfrom["type"] = "file"
      Loadfrom["name"] = verifyval(Optarg)   # Expected to be the author name in quotes eg. "Charles Dickens". Use -p to define project paths.
    }

    if (C == "s")                   # -s Create/update ia9out-scope file
      SaveScope = 1

    # -o / -2 .. -6  Load from wi.csh output file in the "old" / "old2" .. "old6" directory.
    # NOTE(review): this runs before setProject(), so readresults() sees an
    # empty Project["data"] here - confirm that is acceptable.
    if (C in OldDir) {
      checkexists(Optarg, "talgo.awk", "exit")
      readresults(OldDir[C] Optarg)
      Common = iscommon()
      Preloaded = 1
    }

    if (C == "r") {                 # -r Load from hard-coded default settings
      # The "name" field always equals the record key, as in loadresults()/readresults().
      Result["tmpO@Gustave_Jéquier@t@2"]["name"] = "tmpO@Gustave_Jéquier@t@2"
      Result["tmpO@Gustave_Jéquier@t@2"]["count"] = 9
      Result["tmpO@Gustave_Jéquier@t@2"]["realcount"] = 10
      Result["tmpO@Gustave_Jéquier@t@2"]["numfound"] = 0
      Result["tmpO@Gustave_Jéquier@t@2"]["common"] = 1

      Result["tmpO@Gustave_Jéquier@none@1"]["name"] = "tmpO@Gustave_Jéquier@none@1"
      Result["tmpO@Gustave_Jéquier@none@1"]["count"] = 8
      Result["tmpO@Gustave_Jéquier@none@1"]["realcount"] = 10
      Result["tmpO@Gustave_Jéquier@none@1"]["numfound"] = 0
      Result["tmpO@Gustave_Jéquier@none@1"]["common"] = 1

      Result["tmpO@Gustave_Jéquier@w@3"]["name"] = "tmpO@Gustave_Jéquier@w@3"
      Result["tmpO@Gustave_Jéquier@w@3"]["count"] = -1001
      Result["tmpO@Gustave_Jéquier@w@3"]["realcount"] = 0
      Result["tmpO@Gustave_Jéquier@w@3"]["numfound"] = 0
      Result["tmpO@Gustave_Jéquier@w@3"]["common"] = 1

      Common = 1
      Givenname = "Jéquier"
      Preloaded = 1
    }

    if (C == "d")                   # -d <#> Debug level
      Dlevel = Optarg

    if (C == "h") {                 # -h Help
      usage()
      exit
    }
  }

  if (opts == "") {
    usage()
    exit
  }

  setProject(pid)     # library.awk .. load Project[] paths via project.cfg
                      # if -p not given, use default noted in project.cfg

  if (Loadfrom["type"] ~ /ias/) {
    Filen = Loadfrom["name"]
    checkexists(Filen, "talgo.awk", "exit")
    # Given "/home/adminuser/wi-awb/temp/wi-tempxxx/ias.result", return "wi-tempxxx"
    Stamp = a[split(gensub(/[/]ias.result/,"","g",Filen),a,"/")]
    loadresults(Filen)
    Common = iscommon()
    Givenname = clean( strip( readfile(dirname(Filen) "name.txt") ) )
  }
  else if (Loadfrom["type"] ~ /file/) {
    if (Loadfrom["name"] ~ /[.]$/)
      sub(/[.]$/, "~", Loadfrom["name"])
    Filen = Project["tcin"] Loadfrom["name"]
    checkexists(Filen, "talgo.awk", "exit")
    readresults(Filen)
    Common = iscommon()
  }
  else if (! Preloaded) {
    # Nothing was loaded by -f, -l, -r or one of the -o/-2..-6 options.
    # (Previously -r and -o/-2..-6 also ended up here and never reached main().)
    print "Unable to determine option -f or -l."
    exit
  }

  # Sort in order none/t/w - this screws things up but probably needs to be done so algo isn't surprised by unusal ordering
  # PROCINFO["sorted_in"] = "@ind_str_asc"

  main()

}

#
# Debug helper for main(): print title and every Result record at debug level 1.
# Returns the number of records listed.
#
function talgo_dump(title,    o,n) {

  n = 0
  dbg(1, title)
  for (o in Result) {
    dbg(1, o " = " Result[o]["count"] " | " Result[o]["realcount"] " | " Result[o]["numfound"])
    n++
  }
  return n
}

#
# Run the filters and algorithms over Result[], then write the outcome.
# In -l (ias) mode the surviving id goes to ias.out and any Secondfinal ids to
# ias.secondary, both next to Filen; otherwise the result is only printed via dbg().
#
function main(    i,sc,sa,si,sb,o,tempid,outfile) {

  if (length(Stamp) > 0) {
    dbg(1,"./f -d " Stamp)
    dbg(1,"cd " Project["data"] Stamp)

    # Read Scope array in from ias.scope ("|"-separated: id | key | mediatype | date | jstor)
    tempid = Project["data"] Stamp "/"
    if (checkexists( tempid "ias.scope")) {
      sc = split(readfile(tempid "ias.scope"), sa, "\n")
      while (si++ < sc) {
        split(sa[si],sb,"|")
        Scope[strip(sb[1])][strip(sb[2])]["mediatype"] = strip(sb[3])
        Scope[strip(sb[1])][strip(sb[2])]["date"] = strip(sb[4])
        Scope[strip(sb[1])][strip(sb[2])]["jstor"] = strip(sb[5])
        delete sb
      }
    }
    delete sa
  }

  if (Common)
    dbg(1,"Common")
  if (!Common)
    dbg(1,"Uncommon")

  i = talgo_dump("\nOriginal list:\n--------------")

  removenegcount()
  talgo_dump("\nPost-remove negative:\n--------------")   # i deliberately keeps the original count

  # Each later stage is only listed when the record count differs from the last listing.
  removegarbage()
  if (arraycount(Result) != i)
    i = talgo_dump("\nPost-remove garbage:\n--------------")

  removeshortnames()
  if (arraycount(Result) != i)
    i = talgo_dump("\nPost-remove shortnames:\n--------------")

  algo_version1()
  if (arraycount(Result) != i)
    i = talgo_dump("\nPost-remove algo_version1:\n--------------")

  # Disabled. Too many false negatives
  # algo_scope()
  # if( arraycount(Result) != i) {
  #   i = 0
  #   dbg(1,"\nPost-remove algo_scope:\n--------------")
  #   for(o in Result) {
  #     dbg(1,o " = " Result[o]["count"] " | " Result[o]["realcount"] " | " Result[o]["numfound"])
  #     i++
  #   }
  # }

  if (arraycount(Result) > 1)
    algo_original()

  if (length(Result) > 0) {
    for (o in Result) {
      if (Loadfrom["type"] ~ /ias/) {
        # Build the target name first: an unparenthesised concatenation after ">" is ambiguous.
        outfile = dirname(Filen) "ias.out"
        print o > outfile
        close(outfile)
      }
      else
        dbg(1,"\nFinal: " o)
    }
  }

  if (length(Secondfinal) > 0 && Loadfrom["type"] ~ /ias/) {
    outfile = dirname(Filen) "ias.secondary"
    for (o in Secondfinal) {
      print o >> outfile
      close(outfile)
    }
  }
}

#
# Based on data from Scope[] array imported from ias, decide to remove any Results
#
# Currently not called (the call in main() is commented out).
# NOTE(review): okd (and okm in the last loop) are cleared once before the loop,
# not once per record, so a hit on one record carries over to every later one.
# Left unchanged; confirm the intent before re-enabling this function.
#
function algo_scope(    okd,okm,s1,s2,hold,i,js,o) {

  # if name is three words, none type and none have a Scope["date"] (ie. none were matched based on dates) then send to scope
  okd = 0
  for (o in Result) {
    for (s1 in Scope) {
      if (o == s1) {
        for (s2 in Scope[s1]) {
          if (Scope[s1][s2]["date"] == 1)
            okd = 1
        }
      }
    }
    if (! okd) {
      if (numwords(o) == 3 && entity(o, "type") ~ /none/)
        copyarray3d(Result[o],hold,o)
    }
    # if( ! okd) {
    #   if( numwords(o) == 3 && entity(o, "type") ~ /none/) {
    #     delete Result[o]
    #     if(SaveScope)
    #       sendto(Project["scope"], Givenname, o)
    #   }
    # }
  }

  # For any marked as scoped above (in hold[o]), make sure %40 or more are not JSTOR (ie. JSTOR texts that matched on creator)
  for (o in hold) {
    i = js = 0
    for (s1 in Scope) {
      if (o == s1) {
        for (s2 in Scope[s1]) {
          i++
          if (Scope[s1][s2]["jstor"] == 1)
            js++
        }
      }
    }
    if (js > 0 && js <= i) {
      if (percent(js,i) < 40 && percent(js,i) != 0) {   # If 40% or less are JSTOR positive, send to scope
        delete Result[o]
        if (SaveScope)
          sendto(Project["scope"], Givenname, o)
      }
    }
    if (js == 0) {
      delete Result[o]
      if (SaveScope)
        sendto(Project["scope"], Givenname, o)
    }
  }

  # if none have a Scope["date"] and none are mediatype texts
  okd = okm = 0
  for (o in Result) {
    for (s1 in Scope) {
      if (o == s1) {
        for (s2 in Scope[s1]) {
          if (Scope[s1][s2]["date"] == 1)
            okd = 1
          if (Scope[s1][s2]["mediatype"] ~ /texts/)
            okm = 1
        }
      }
    }
    if (! okd && ! okm) {
      delete Result[o]
      if (SaveScope)
        sendto(Project["scope"], Givenname, o)
    }
  }
}

#
# Helper for algo_version1() phase 1: make record o the current hold, remember it
# in hold2[] under its name, and log the decision with its case label (tag).
#
function v1_hold2(hold, hold2, o, tag) {
  copyarray(Result,hold,o)
  hold2[entity(o,"name")] = hold["name"]
  dbg(2,tag " HOLD2[" entity(o,"name") "] ADDED: " hold2[entity(o,"name")])
}

#
# Helper for algo_version1() phase 2: make record o the current hold and log it.
#
function v1_hold(hold, o, tag) {
  copyarray(Result,hold,o)
  dbg(2,tag " HOLD[" entity(o,"name") "] ADDED: " hold["name"])
}

#
# Core algo - mess .. split up?
#
# Phase 1: for each distinct name in Result[], pick one record (recorded in hold2[]).
# Then: drop records not picked, and move records whose last name does not match
#       Givenname into Secondfinal[].
# Phase 2: run a nearly identical selection over what is left and trim Result[]
#       down to that single record.
# Does nothing when Result[] holds 0 or 1 records.
# The order in which records are visited is whatever "for (o in Result)" yields.
#
function algo_version1(    hold,hold2,uniqnames,o,u,keep,ucount) {

  delete hold2
  delete uniqnames
  delete hold
  hold["name"] = ""
  hold["count"] = -1
  hold["realcount"] = -1
  hold["numfound"] = -1
  hold["common"] = -1

  dbg(1," ")

  if (arraycount(Result) == 1 || arraycount(Result) == 0)
    return

  # Build a list of uniq names eg. "Charlotte_Bronte", "Charlette_Sarah_Bronte" etc..
  for (o in Result) {
    if (uniqnames[entity(o, "name")] == "")
      uniqnames[entity(o, "name")] = entity(o, "name")
  }

  # For each uniq name, loop through the Result list and only process those with that name
  for (u in uniqnames) {
    dbg(2,"Outer Loop for " u)

    ucount = 0      # Count number of Results for this name
    for (o in Result) {
      if (entity( Result[o]["name"],"name") == u)
        ucount++
    }

    for (o in Result) {
      if (entity( Result[o]["name"],"name") == u) {
        dbg(2,"Inner Loop for " o)

        if (hold["name"] == "") {    # Seed the first name
          if (ucount > 1 && entity(o, "type") ~ /^w$/ && ! sauce(Result,o,"0.5") && numwords(o) > 1) {
            dbg(2,"A. Skipping " o " - w in seed fails sauce." )
            continue
          }
          else {
            v1_hold2(hold, hold2, o, "A.")
            continue
          }
        }

        if (int(Result[o]["realcount"]) == int(hold["realcount"])) {
          if (! Common) {
            if (int(Result[o]["numfound"]) > int(hold["numfound"])) {
              # If result is "w" and sauce == 1
              if (entity(o, "type") ~ /^w$/) {
                if (int(Result[o]["realcount"]) == int(hold["numfound"])) {
                  # skip it
                }
                else if (sauce(Result,o,"0.5") == 1) {
                  v1_hold2(hold, hold2, o, "B.")
                }
              }
              else {
                # If sauce result but not sauce hold, replace hold with result
                if (sauce(Result,o,"0.5") && ! sauce_algo(hold["count"],hold["realcount"],hold["numfound"],"0.5")) {
                  v1_hold2(hold, hold2, o, "C.")
                }
              }
            }
            else if (int(Result[o]["numfound"]) == int(hold["numfound"]) && entity(o, "type") ~ /^none$/) {
              v1_hold2(hold, hold2, o, "D.")
            }
          }
          else {
            # if Result = "t" and hold != "t"
            if (int(Result[o]["numfound"]) >= int(hold["numfound"]) && entity(o, "type") ~ /^t$/ && sauce(Result,o,"0.4") == 1) {
              v1_hold2(hold, hold2, o, "E.")
            }
            # Always replace "w" with something else
            else if (entity(hold["name"],"type") ~ /^w$/ && sauce(Result,o,"0.4") == 1) {
              v1_hold2(hold, hold2, o, "F.")
            }
            # If sauce hold = 0 and sauce Result = 1
            else if (sauce_algo(hold["count"],hold["realcount"],hold["numfound"],"0.4") == 0 && sauce(Result,o,"0.4") == 1) {
              v1_hold2(hold, hold2, o, "G.")
            }
            # If Result numfound > hold and sauce Result = 1
            else if (int(Result[o]["numfound"]) > int(hold["numfound"]) && sauce(Result,o,"0.5") == 1) {
              v1_hold2(hold, hold2, o, "G1.")
            }
          }
        }
        else {
          if (! Common) {
            if (int(Result[o]["realcount"]) > int(hold["realcount"])) {
              # If result is "w" and sauce == 1
              if (entity(o, "type") ~ /^w$/) {
                if (sauce(Result,o,"0.5") == 1) {
                  v1_hold2(hold, hold2, o, "H.")
                }
              }
              # If result is anything but "w"
              else {
                v1_hold2(hold, hold2, o, "I.")
              }
            }
          }
          else {
            # Choose the largest of "w" or "t" realcount - if none available default to "none" (seed)
            if (entity(o, "type") ~ /^w$|^t$/) {
              # If hold is a t and result is a w, choose the w if it has more realcount, and sauce Realcount == 1
              if (int(Result[o]["realcount"]) > int(hold["realcount"]) && entity(o, "type") ~ /^w$/ && entity(hold["name"],"type") ~ /^t$/ && sauce(Result,o,"0.4")) {
                if (entity(o, "type") ~ /^w$/) {
                  # If w realcount is within 5% of hold realcount then leave hold alone.
                  if (percent( int(hold["realcount"]), int(Result[o]["realcount"]) ) < 95) {
                    v1_hold2(hold, hold2, o, "J1.")
                  }
                  # ..except if result numfound is < 10% of hold numfound
                  else if (int(Result[o]["numfound"]) > int(hold["numfound"]) && percent( int(hold["numfound"]), int(Result[o]["numfound"]) ) > 90) {
                    v1_hold2(hold, hold2, o, "J2.")
                  }
                }
              }
              # If hold is a w and result is a t, choose the t if it has more than or equal to realcount
              else if (int(Result[o]["realcount"]) >= int(hold["realcount"]) && entity(o, "type") ~ /^t$/ && entity(hold["name"],"type") ~ /^w$/) {
                v1_hold2(hold, hold2, o, "K.")
              }
              # Default to wipe out none if a t/w exists, the t/w has > realcount, etc
              else if (entity(hold["name"],"type") ~ /^none$/ && int(hold["realcount"]) < int(Result[o]["realcount"]) && int(Result[o]["count"]) != -1001 && sauce(Result,o,"0.4")) {
                if (entity(o, "type") ~ /^w$/) {
                  # If w realcount is within 5% of hold realcount then leave hold alone.
                  if (percent( int(hold["realcount"]), int(Result[o]["realcount"]) ) < 95) {
                    v1_hold2(hold, hold2, o, "L1.")
                  }
                  # ..except if result numfound is < 10% of hold numfound
                  else if (int(Result[o]["numfound"]) > int(hold["numfound"]) && percent( int(hold["numfound"]), int(Result[o]["numfound"]) ) > 90) {
                    v1_hold2(hold, hold2, o, "L2.")
                  }
                }
                else {
                  v1_hold2(hold, hold2, o, "L3.")
                }
              }
              # Replace none if it fails sauce and w/t passes sauce
              else if (entity(hold["name"],"type") ~ /^none$/ && sauce(Result,o,"0.4") && ! sauce_algo(hold["count"],hold["realcount"],hold["numfound"],"0.4")) {
                v1_hold2(hold, hold2, o, "L4.")
              }
            }
            else {
              if (entity(Result[o]["name"],"type") ~ /^none$/) {
                # Result realcount > hold realcount, sauce -- replace hold (w or t) with none
                if (int(Result[o]["realcount"]) > int(hold["realcount"]) && sauce(Result,o,"0.5")) {
                  v1_hold2(hold, hold2, o, "M.")
                }
                # If Result and hold both fail sauce, pick the largest realcount, except if 1-word name then pick largest numfound (see "Molière")
                else {
                  if (numwords(o) == 1 && int(Result[o]["numfound"]) > int(hold["numfound"]) && ! sauce(Result,o,"0.4") && ! sauce_algo(hold["count"],hold["realcount"],hold["numfound"],"0.4") && int(Result[o]["count"]) != -1001) {
                    v1_hold2(hold, hold2, o, "N1.")
                  }
                  else if (numwords(o) > 1 && int(Result[o]["realcount"]) > int(hold["realcount"]) && ! sauce(Result,o,"0.4") && ! sauce_algo(hold["count"],hold["realcount"],hold["numfound"],"0.4") && int(Result[o]["count"]) != -1001) {
                    v1_hold2(hold, hold2, o, "N2.")
                  }
                }
              }
            }
          }
        }
      }
    }
    delete hold     # hold["name"] reads as "" again, so the next name re-seeds
  }

  for (o in Result) {    # Delete all records in Result that are not in hold2[] (leaving one record per uniq name)
    keep = 0
    for (u in hold2) {
      if (Result[o]["name"] == hold2[u]) {
        keep = 1
      }
    }
    if (keep == 0)
      delete Result[o]
  }

  # Remove records with a different last name than Givenname and mark it as secondary.
  # Mark it as secondary for later manual processing?
  # NOTE(review): match() treats the last name of Givenname as a regex; left as is
  # because an empty Givenname currently keeps every record.
  for (o in Result) {
    # print "1. |" qlastname(entity(Result[o]["name"],"name")) "| = |" qlastname(Givenname) "|"
    if (match( strip(qlastname(entity(Result[o]["name"],"name"))), strip(qlastname(Givenname))) == 0) {
      dbg(1, "\nSecond Final: " o)
      copyarray3d(Result[o],Secondfinal,o)
      delete Result[o]
    }
  }

  delete hold
  hold["name"] = ""
  hold["count"] = -1
  hold["realcount"] = -1
  hold["numfound"] = -1
  hold["common"] = -1

  # Run nearly-same algo as above on remaining Result records
  dbg(2,"----")
  for (o in Result) {
    dbg(2,"Inner Loop for " o)

    if (hold["name"] == "") {    # Seed the first name
      v1_hold(hold, o, "0.")
      continue
    }

    if (int(Result[o]["realcount"]) == int(hold["realcount"])) {
      if (! Common) {
        if (int(Result[o]["numfound"]) > int(hold["numfound"])) {
          # If result is "w" and sauce == 1
          if (entity(o, "type") ~ /^w$/) {
            if (int(Result[o]["realcount"]) == int(hold["numfound"])) {
              # skip it
            }
            else if (sauce(Result,o,"0.5") == 1) {
              v1_hold(hold, o, "1.")
            }
          }
          # If result is anything but "w"
          else {
            v1_hold(hold, o, "2.")
          }
        }
        # Not an "else": this is evaluated against hold as possibly just replaced above.
        if (int(Result[o]["numfound"]) == int(hold["numfound"]) && entity(o, "type") ~ /^none$/) {
          v1_hold(hold, o, "3.")
        }
        else if (int(Result[o]["numfound"]) == int(hold["numfound"]) && numwords(Result[o]["name"]) > numwords(hold["name"])) {
          v1_hold(hold, o, "3.2")
        }
      }
      else {
        if (int(Result[o]["numfound"]) >= int(hold["numfound"]) && entity(o, "type") ~ /^t$/ && entity(hold["name"],"type") !~ /^t$/) {
          v1_hold(hold, o, "4.")
        }
        else if (int(Result[o]["numfound"]) >= int(hold["numfound"]) && entity(hold["name"],"type") ~ /^w$/ && sauce(Result,o,"0.4") == 1) {   # Always replace a hold of "w" with something else
          v1_hold(hold, o, "5.")
        }
        # if numfound equal, replace if sauce hold = 0 and sauce Result = 1
        else if (int(Result[o]["numfound"]) == int(hold["numfound"]) && sauce_algo(hold["count"],hold["realcount"],hold["numfound"],"0.4") == 0 && sauce(Result,o,"0.4") == 1) {
          v1_hold(hold, o, "6.")
        }
        else if (int(Result[o]["numfound"]) == int(hold["numfound"]) && length(o) > length(hold["name"])) {
          v1_hold(hold, o, "7.")
        }
      }
    }
    else {
      if (! Common) {
        if (int(Result[o]["realcount"]) > int(hold["realcount"])) {
          # If result is "w" and sauce == 1
          if (entity(o, "type") ~ /^w$/) {
            if (sauce(Result,o,"0.5") == 1) {
              v1_hold(hold, o, "8.")
            }
          }
          # If result is anything but "w"
          else {
            v1_hold(hold, o, "9.")
          }
        }
      }
      else {
        # Choose the largest of "w" or "t" realcount - default to "none" (seed)
        if (entity(o, "type") ~ /^w$|^t$/) {
          if (entity(o, "type") ~ /^w$/ && int(Result[o]["realcount"]) > int(hold["realcount"]) && entity(hold["name"],"type") ~ /^t$/ && sauce(Result,o,"0.5")) {
            v1_hold(hold, o, "10.")
          }
          else if (int(Result[o]["realcount"]) >= int(hold["realcount"]) && entity(o, "type") ~ /^t$/ && entity(hold["name"],"type") ~ /^w$/) {
            v1_hold(hold, o, "11.")
          }
          else if (int(Result[o]["realcount"]) > int(hold["realcount"]) && entity(o, "type") ~ /^t$/ && entity(hold["name"],"type") ~ /^t$/) {
            v1_hold(hold, o, "12.")
          }
          else if (int(Result[o]["realcount"]) > int(hold["realcount"]) && entity(o, "type") ~ /^w$/ && entity(hold["name"],"type") ~ /^w$/ && sauce(Result,o,"0.5")) {
            v1_hold(hold, o, "13.")
          }
          # NOTE(review): the same sauce() test appears twice below; the second was
          # possibly meant to test hold via sauce_algo(). Kept as found.
          else if (int(Result[o]["realcount"]) > int(hold["realcount"]) && entity(o, "type") ~ /^w$/ && entity(hold["name"],"type") ~ /^w$/ && ! sauce(Result,o,"0.4") && ! sauce(Result,o,"0.4")) {
            v1_hold(hold, o, "14.")
          }
          else if (entity(hold["name"],"type") ~ /^none$/ && int(hold["realcount"]) < int(Result[o]["realcount"])) {
            v1_hold(hold, o, "15.")
          }
        }
        else {
          if (entity(Result[o]["name"],"type") ~ /^none$/) {
            if (int(Result[o]["realcount"]) > int(hold["realcount"])) {
              v1_hold(hold, o, "16.")
            }
            # NOTE(review): cases 17 and 18 cannot fire as written - this code only runs
            # when the realcounts differ, and case 16 already takes the ">" case.
            else if (int(Result[o]["realcount"]) == int(hold["realcount"])) {
              if (numwords(o) <= 2 && int(Result[o]["numfound"]) >= int(hold["numfound"]) && sauce(Result,o,"0.4")) {
                v1_hold(hold, o, "17.")
              }
            }
            else if (int(Result[o]["realcount"]) > int(hold["realcount"]) && ! sauce(Result,o,"0.4") && ! sauce_algo(hold["count"],hold["realcount"],hold["numfound"],"0.4") && entity(hold["name"],"type") !~ /^t$/) {
              copyarray(Result,hold,o)
              dbg(2,"18. HOLD2[" entity(o,"name") "] ADDED: " hold2[entity(o,"name")])
            }
          }
        }
      }
    }
  }

  dbg(1, "Final Hold = " hold["name"] )

  for (o in Result) {    # Trim Result to a single record = hold["name"]
    # Exact comparison: a regex match would treat characters such as "(" or "." in the
    # name as operators and would also keep ids that merely contain the held id.
    if (o != hold["name"]) {
      delete Result[o]
      dbg(2,"Deleting " o)
    }
  }

  # if(length(Result) == 0)
  #   for(o in Secondfinal)
  #     copyarray3d(Secondfinal[o],Result,o)

}

#
# Remove items in Result except the set with the most words.
#
# Works on the global Result[]; returns nothing.
# Does nothing when the remaining records carry more than one distinct first name,
# or when the longest name has more than 3 words.
#
function removeshortnames(    o,current,hold,a,first,samelast) {

  delete hold
  delete a
  for (o in Result) {    # Find set with most words
    if (numwords(o) > current) {
      current = numwords(o)
      copyarray(Result, hold, o)
    }
  }

  # Don't do it if the first names are different. For cases like "John Smith Jones" vs. "Smith Jones"
  #  exception: If name is a single word equal to the lastname of Givenname then do it
  for (o in Result) {
    # A one-word name that matches the last name of Givenname is not counted as a
    # first name, unless its id field is 19.
    # NOTE(review): the meaning of id 19 is not visible in this file.
    if (int(numwords(o)) == 1 && int(entity(o, "id")) != 19 && entity(o,"name") ~ qlastname(Givenname))
      continue
    first = qfirstname(entity(o,"name"))
    a[first] = first
  }
  if (length(a) > 1)
    return

  if (current <= 3) {    # Only do it if the longest name is <= 3 words, and last name of longest name is = Givenname
    samelast = (qlastname(entity(hold["name"],"name")) == qlastname(Givenname))
    for (o in Result) {
      if (numwords(o) < current && samelast)
        delete Result[o]
    }
  }
  else {
    # Else?
  }
}

#
# Remove garbage Result names (bad parse of the wikisource by getnames.awk)
#
function removegarbage(    o) {

  for (o in Result) {
    if (substr( entity(o,"name"), 1, 1) ~ /[a-z]/)     # If name begins with lowercase letter a-z
      delete Result[o]
  }
}

#
# Remove items in Result with neg ["count"] value OR zero realcount OR zero numfound
#
# Values are coerced with int() as elsewhere in this file, so records loaded from a
# file are compared as numbers.
#
function removenegcount(    o) {

  # Comment out when doing production runs. If backtesting existing templates (eg. internetarchive9e.csh) add this back in.
  # (i = number of records with count < 1 and numfound > 0)
  # if(i == length(Result) ) {       # All entries are negative, choose the t if it has > 1 realcount
  #   return
  # }

  for (o in Result) {
    if (int(Result[o]["count"]) < 0 || int(Result[o]["realcount"]) < 1 || int(Result[o]["numfound"]) < 1)
      delete Result[o]
  }
}

#
# Return the requested entity from an id string
#   eg.
#       id = tmpO@Gustave_Jéquier@w@3
#       entity = "name" (second field) or "type" (third field) or "id" (fourth field)
#   Returns "" when type is not one of those or id does not have exactly 4 "@" fields.
#
function entity(id, type,    a,c) {

  c = split(id, a, "@")
  if (type !~ /^name$|^type$|^id$/ || c != 4) {
    #print "ias.awk: error in function entity(): wrong id or type"
    #exit
    return ""
  }
  if (type ~ /^name$/)
    return a[2]
  if (type ~ /^type$/)
    return a[3]
  if (type ~ /^id$/)
    return a[4]
}

#
# Does Result contain a common name?
#   Returns 1 if any record has ["common"] > 0, else 0.
#
function iscommon(    o) {

  for (o in Result) {
    if (Result[o]["common"] > 0)
      return 1
  }
  return 0
}

#
# Count numer of elements in an array
#
function arraycount(arr,    i,o) {

  i = 0
  for (o in arr)
    i++
  return i
}

#
# Return number of words in a name. Given an id string eg. tmpO@Gustave_Jéquier@w@3 returns 2
#
function numwords(id,    a) {
  return split( entity(id, "name"), a, "_")
}

#
# Remove item of type in Result, but only if there is at least 1 item remaining
#   type is used as a regex against the record's type field.
#
function removetype(type,    o) {

  if (arraycount(Result) == 1)
    return

  for (o in Result) {
    if (entity(o, "type") ~ type) {
      delete Result[o]
      if (arraycount(Result) == 1)
        return
    }
  }
}

#
# Reduce Result[] to the single record chosen by the original algorithm.
# The winner is also left in the global Final[].
#
function algo_original(    o) {

  # Algorithm logic:
  #   For each name (eg. "tmpO@Gustave_Jéquier@w@3") look at:
  #      count = weighted total score (+1 for a positive hit, -1 for no hit)
  #      realcount = absolute total number of positive hits
  #      numfound = total books found (regardless of pos/neg hit)
  #   Filter out and ignore name's with a negative count.
  #   For the remainder, choose the one with the greater realcount.
  #   If there is a tie (realcount), choose the one with the greater numfound.
  #   If there is a tie (numfound), choose the one with the longer name (eg. favor "none" over "t", 3-let names over 2-let)

  Final["name"] = ""
  Final["count"] = -1
  Final["realcount"] = -1
  Final["numfound"] = -1
  Final["common"] = -1         # if 1, then contains a common name. See common.awk

  dbg(1,"\nFinal algo:\n-------")
  for (o in Result) {
    # NOTE(review): this listing is printed unconditionally, unlike the dbg() header above.
    print o " = " Result[o]["count"] " | " Result[o]["realcount"] " | " Result[o]["numfound"]
    if (Result[o]["count"] >= 0) {
      if (int(Result[o]["realcount"]) > int(Final["realcount"])) {
        copyarray(Result,Final,o)
        dbg(2,"Final(1) = " Final["name"] " " Final["realcount"])
      }
      else if (int(Result[o]["realcount"]) == int(Final["realcount"])) {
        dbg(2,"Final(2) = " Final["name"])
        if (int(Result[o]["numfound"]) > int(Final["numfound"])) {
          copyarray(Result,Final,o)
          dbg(2,"Final(3) = " Final["name"])
        }
        else if (int(Result[o]["numfound"]) == int(Final["numfound"])) {
          dbg(2,"Final(5) = " Final["name"])
          if (int(length(o)) > int(length(Final["name"]))) {
            copyarray(Result,Final,o)
            dbg(2,"Final(4) = " Final["name"])
          }
        }
      }
    }
  }

  # Clear Result of all but final result
  for (o in Result) {
    if (Result[o]["name"] != Final["name"]) {
      delete Result[o]
    }
  }
}

#
# Copy Result into a flat array
#   The first parameter is not used: the record is always read from the global
#   Result[name]. final["name"] is set to the key, not to the stored "name" field.
#
function copyarray(result, final, name) {

  final["name"] = name
  final["count"] = Result[name]["count"]
  final["realcount"] = Result[name]["realcount"]
  final["numfound"] = Result[name]["numfound"]
  final["common"] = Result[name]["common"]
}

#
# Copy Result into a 3D array
#   As with copyarray(), the first parameter is not used; the source is Result[name].
#
function copyarray3d(result, final, name) {

  final[name]["name"] = name
  final[name]["count"] = Result[name]["count"]
  final[name]["realcount"] = Result[name]["realcount"]
  final[name]["numfound"] = Result[name]["numfound"]
  final[name]["common"] = Result[name]["common"]
}

#
# Copy 3d arr to Result
#
function copyarray2(arr, name) {

  Result[name]["name"] = arr[name]["name"]
  Result[name]["count"] = arr[name]["count"]
  Result[name]["realcount"] = arr[name]["realcount"]
  Result[name]["numfound"] = arr[name]["numfound"]
  Result[name]["common"] = arr[name]["common"]
}

#
# Load results from results.ias in temp directory (generated by ias.awk)
#   Each non-empty line is "id | count | realcount | numfound".
#   Returns 0 when the file content splits into no lines; otherwise returns nothing.
#
function loadresults(filen,    str,c,a,i,b,e,key) {

  str = clean( readfile(filen) )
  c = split(str,a,"\n")
  if (c < 1)
    return 0

  while (i++ < c) {
    if (length(a[i]) > 0) {
      split(strip(a[i]),b,"|")
      key = strip(b[1])
      Result[key]["name"] = key
      Result[key]["count"] = strip(b[2])
      Result[key]["realcount"] = strip(b[3])
      Result[key]["numfound"] = strip(b[4])
      split(key, e, "@")                                                # eg. tmpO@Adam_Heinrich_Müller@none@6
      Result[key]["common"] = commonname( gensub(/_/," ", "g", e[2]) )   # Flag if a common name
    }
  }
}

#
# Load results from a wi.csh output file.
#   Sets the globals Givenname (from the "Internet Archive<digit>" line) and Stamp
#   (from a line containing the temp directory path), then fills Result[] from the
#   "tmpO" lines that follow the "|2---" marker. Prints a blank line before and
#   after loading, and echoes any "Abort. Reason" line.
#
function readresults(filen,    str,i,c,e,out,a,inloop,inbound,o,tempdir,key) {

  tempdir = Project["data"] "wi-awb"

  while ((getline str < filen) > 0) {
    str = clean(str)

    if (str ~ /^Internet Archive[0-9]/) {     # Get the passed name. Only needed in -f mode
      gsub(/[)]{2}/,")",str)
      c = split(str,a,"(")
      if (c == 3)
        out = a[2] "(" a[3]
      if (c == 2) {
        out = a[2]
        gsub(/[)]/,"",out)      # bracket expression: a bare ")" regex depends on GNU regex leniency
      }
      gsub(/[(][^)]+[)]/,"",out)
      Givenname = strip(out)
    }

    if (str ~ tempdir) {                     # Get the stamp ID. Only used in -f mode. This is lazy hard coded.
      c = split(str,a,"/")
      Stamp = a[c - 1]
    }

    if (str ~ /\|2---/)
      inloop = 1

    if (inloop) {
      if (str ~ /tmpO/ && str !~ /Final/)
        inbound[++i] = strip(str)
    }

    if (str ~ /Abort. Reason/)
      print "\n" str
  }
  close(filen)

  print
  for (o in inbound) {
    c = split(inbound[o], a, "|")      # eg. tmpO@Adam_Heinrich_Müller@none@6 | 35 | 35 | 36
    key = strip(a[1])
    Result[key]["name"] = key
    Result[key]["count"] = strip(a[2])
    Result[key]["realcount"] = strip(a[3])
    Result[key]["numfound"] = strip(a[4])
    c = split(key, e, "@")             # eg. tmpO@Adam_Heinrich_Müller@none@6
    Result[key]["common"] = commonname( gensub(/_/," ", "g", e[2]) )     # Flag if a common name
  }
  print
}

#
# Return 1 if a common name.
#   cname is a space-separated name. Its first word is looked up in the
#   "common-first" list and its last word in the "common-last" list (one name per
#   line, both under Home). Returns 0 when neither list contains it.
#
function commonname(cname,    a,c) {

  c = split(cname, a, " ")

  # NOTE(review): these checks use the bare file names while the reads below use
  # Home "..." - confirm checkexists() resolves them the same way.
  checkexists("common-first", "talgo.awk commonname()", "exit")
  checkexists("common-last", "talgo.awk commonname()", "exit")

  if (talgo_inlist(Home "common-first", a[1]))
    return 1
  if (talgo_inlist(Home "common-last", a[c]))
    return 1
  return 0
}

#
# Helper for commonname(): return 1 if some line of file equals word exactly.
#   Whole-line string comparison is used rather than a regex built from word, so
#   characters such as "(" or "." in a name cannot break or widen the match.
#
function talgo_inlist(file, word,    str,found) {

  found = 0
  while ((getline str < file) > 0) {
    if (str == word) {
      found = 1
      break
    }
  }
  close(file)
  return found
}

#
# Given a set of three numbers: 61 | 94 | 127
#  Calculate: 127 - 61 = 66
#             66 * 0.5 = 33
#             61 + 33 = 94
#             if 94 >= 94 return 1, else return 0
#  Adjust the 0.5 upwards to be more leinent ie. return 1 more often
#  return 1 = accept, 0 = reject
#
#  The calculation itself is done by the external program Exe["sauce"]; this
#  function only builds the command line and returns what sys2var() gives back.
#  The three counts are forced to integers so text read from a results file
#  cannot end up inside the shell command.
#
function sauce_algo(count,realcount,numfound,factor,    i,command) {

  command = Exe["sauce"] " -s \"" int(count) "|" int(realcount) "|" int(numfound) "|" factor "\""
  i = sys2var(command)
  return i
}

#
# Run sauce_algo() on record arr[ix] with the given factor.
#
function sauce(arr,ix,factor) {
  return sauce_algo(int(arr[ix]["count"]), int(arr[ix]["realcount"]), int(arr[ix]["numfound"]), factor )
}

#
# Return the last name of a str eg. "John Smith" return "Smith"
#   Underscores count as spaces. Returns "" for an empty str.
#
function qlastname(str,    a,c) {

  gsub(/_/," ",str)
  c = split(str,a," ")
  return a[c]
}

#
# Return the first name of a str eg. "John Smith" return "John"
#   Underscores count as spaces. Returns "" for an empty str.
#
function qfirstname(str,    a) {

  gsub(/_/," ",str)
  split(str,a," ")
  return a[1]
}

#
# Print debug statement of certain level and lower
#   Dlevel is forced to a number: taken from a command-line option it can be a
#   string, and "2" <= "10" is false when compared as text.
#
function dbg(level, str) {

  if (level <= Dlevel + 0)
    print str
}

#
# Print command-line help.
#
function usage() {

  print "talgo -- Run test algorithms using input from ias.awk"
  print ""
  print "Options are: (required: f or l or r)"
  print "  -f \"First Last\"  Read from meta files"
  print "  -p              Use project name. If none default to project.cfg -> default.id"
  print "  -s              Create/update ia9out-scope file (default: don't)"
  print "  -r              Load from hard coded default settings (debug function)"
  print "  -o <file>       Load from wi.csh output file in extasciiwildfix.old"
  print "  -2 .. -6 <file> Load from wi.csh output file in extasciiwildfix.old2 .. old6"
  print "  -d#             Debug statement level. Where # is 1 to X with higher numbers greater output."
  print "  -l              Load from ias.awk output file eg. ias.result - caution: only use with other programs."
  print "  -h              Help"
  print ""
  print "Example: talgo -s -d2 -p births1870.0251-0500 -f \"Charles Dickens\""
  print ""
}