#!/usr/local/bin/awk -E

@include "init.awk"
@include "library.awk"
#@include "getopt.awk"

# NOTE(review): this file was recovered from a dump in which line breaks were collapsed and
#   markup inside string literals / regular expressions was stripped. Line structure has been
#   restored. Literals carry exactly the text that survived (line breaks left inside a literal
#   are written as \n); places where markup is evidently missing are flagged below and must be
#   restored from the upstream copy before the pages render or parse as intended.

BEGIN {

# ################################################################################################
#
# Internet Archive Works on Wikipedia (iawow)
#   derived from Internet Archive Classic Search (iacs)
#   all by Stephen Balbach May 2015 / February 2016
#   iacs2015@nym.hush.com
#   MIT License
#
# Requirement: GNU Awk 4.1+ or greater. Other dependencies in init.awk
#
# ____________________________________________________________________________________________
#
# Configuration variables:
#
# 1. Program filename - the name of the script.

  MyProg = "iawow.awk"

# 2. Run as a "standalone" web server on a PC; or as a "cgi" script on an existing webserver.
#    (Both lines appear commented out in the recovered text; webserver() defaults RunType to
#    "standalone" when it is unset.)
#
#    RunType = "standalone"
#    RunType = "cgi"

# 3. If RunType = "standalone" set the following two variables.
#
# 3.1. Port to run the server on. Can be anything. May need to modify your firewall to allow access.
#      Ignore this if RunType is "cgi".

  MyPort = "8080"

# 3.2 Hostname - either a working domain name, a static IP, or if running
#     on the same computer you're browsing from in "standalone" mode set to "localhost".
#     Ignore this if RunType is "cgi".

  MyHost = "localhost"

# 5. Number of results per page (50 was the IA default). Can be anything based on
#    your computer speed and memory etc..

  Rows = 50

# 6. Page index block size. This is the number of pages displayed in the index bar before
#    the "Next" button.

  PBSize = 20

#
# 7. URL Agent string - this is what Internet Archive sees in their logs. Recommend
#    setting the name of the program and your contact information, such as an email
#    or your Internet Archive userid.

  Agent = "Internet Archive Works on Wikipedia (iawow) (iacs2015@nym.hush.com)"

#
# 8. To run in "standalone" mode:
#      Start the server: awk -f iacs.awk
#      To access: http://:/          (NOTE(review): host/port placeholders look stripped)
#      eg. http://localhost:8080/
#
#
# ##################################################################################################

  # Command line: -p <project id>
  Optind = Opterr = 1
  while ((C = getopt(ARGC, ARGV, "p:")) != -1) {
    if (C == "p")
      pid = verifypid(Optarg)
  }
  setProject(pid)     # library.awk .. load Project[] paths via project.cfg
                      # if -p not given, use default noted in project.cfg

  delete IX           # index cache used by whatistempid()
  loadtccfg()         # Populate Tccfg[] from Project["tccfg"] ie. .tc.cfg generated by mkcfg
  webserver()

}

# _________________________ web server, static HTML, CGI _______________________________________________________

#
# Web server
#  Credit: adapted from GNU Awk manual
#
#  Serves one page per connection on /inet/tcp/<MyPort>/0/0. Each pass sends the current
#  Document, then reads the next request line and hands it to CGI_setup(). A "status"
#  argument records the reviewer's choice via changestatus(); "quit", or running out of
#  unprocessed names, writes the cfg file, sends QuitPage() and exits.
#
function webserver() {

  if (MyHost == "")
    MyHost = "localhost"
  if (MyPort == "")
    MyPort = "8080"
  if (RunType == "")                  # same defaulting as host/port; this function is the standalone server
    RunType = "standalone"

  printf("Webserver at http://%s:%s\nOK\n", MyHost, MyPort)

  HttpService = "/inet/tcp/" MyPort "/0/0"
  URLPrefix   = "http://" MyHost ":" MyPort

  while (1) {                         # Loop web server running

    RS = ORS = "\r\n"
    Status   = 200                    # OK
    Reason   = "OK"
    Header   = MetaHeader()
    Document = PageHeader()
    Footer   = PageFooter()

    # No pending status change: queue up the next unprocessed name.
    if (GETARG["status"] == "") {
      if ( ! loadnextGETARG() ) {
        print "Condition uncertain. All names processed already? Exiting without save."
        exit
      }
    }

    if (GETARG["Method"] == "GET") {

      if (GETARG["status"] != "") {
        if (GETARG["status"] == "quit") {
          writetccfg()
          Document = PageHeader() " " QuitPage()
          done = 1
        }
        else {
          changestatus(GETARG["name"], GETARG["status"])
          if ( ! loadnextGETARG() ) {
            writetccfg()
            Document = PageHeader() " " QuitPage()
            done = 1
          }
        }
      }

      if (GETARG["page"] == "")
        GETARG["page"] = 1
      if (GETARG["name"] == "")
        GETARG["name"] = "unknown"

      # Do not replace the quit page (and do not hit the network) once we are finishing.
      if ( ! done && GETARG["query"] != "" )
        Document = sprintf("%s\n%s\n", PageHeader(), search_ia(GETARG["query"], GETARG["page"], GETARG["sort"], GETARG["name"]))

    } else if (GETARG["Method"] == "HEAD") {
      # not yet implemented
    } else if (GETARG["Method"] != "") {
      print "bad method", GETARG["Method"]
    }

    Prompt = Header Document Footer
    print "HTTP/1.0", Status, Reason |& HttpService
    print "Connection: Close"        |& HttpService
    print "Pragma: no-cache"         |& HttpService
    # NOTE(review): length() counts characters, Content-length is in bytes; these differ for
    #   multi-byte text unless gawk runs in a byte locale.
    len = length(Prompt) + length(ORS)
    print "Content-length:", len     |& HttpService
    print ORS Prompt                 |& HttpService

    if (done) {
      printf("Webserver at http://%s:%s\nQuit\n", MyHost, MyPort)
      exit
    }

    # ignore all the header lines
    while ((HttpService |& getline) > 0)      # Loop wait for click
      ;
    # stop talking to this client
    close(HttpService)
    # wait for new client request
    HttpService |& getline
    # do some logging
    print systime(), strftime(), $0
    # request line fields: method, URI, version
    CGI_setup($1, $2, $3)
  }
}

#
# Parse CGI environment into global GETARG[] array.
#
#  method, uri, version : the three fields of the HTTP request line.
#  Sets GETARG["Method"], GETARG["URI"], GETARG["Version"] plus one entry per "key=value"
#  pair after "?". MENU[] receives the path split on "/" and ":". Values are stored as
#  received (no URL-decoding is done here).
#
function CGI_setup(method, uri, version,    i, j, qpos) {

  delete GETARG; delete MENU; delete PARAM

  GETARG["Method"]  = method
  GETARG["URI"]     = uri
  GETARG["Version"] = version

  qpos = index(uri, "?")                      # is there a "?" indicating a CGI request?
  if (qpos == 0) {                            # no "?", no need for splitting PARAMs
    split(uri, MENU, "[/:]")
    return
  }

  split(substr(uri, 1, qpos - 1), MENU, "[/:]")
  split(substr(uri, qpos + 1), PARAM, "&")
  for (i in PARAM) {
    j = index(PARAM[i], "=")
    if (j > 0)
      GETARG[substr(PARAM[i], 1, j - 1)] = substr(PARAM[i], j + 1)
    else if (PARAM[i] != "")
      GETARG[PARAM[i]] = ""                   # bare key without "=": keep the key, empty value
  }
}

#
# Static HTML
#
#  NOTE(review): the markup of every page fragment below was stripped; only the text survived.
#
function MetaHeader(    str) {
  str = "Internet Archive Works on Wikipedia\n\n\n"
  return str
}

# Page banner including the "article N of M" progress read from Count[].
function PageHeader(    str) {
  str = "\narticle " Count["processed"] " of " Count["total"] "\n\nInternet Archive Works on Wikipedia\n\n\n"
  return str
}

function PageFooter(    str) {
  str = "\n"
  return str
}

function SavePage(    str) {
  str = "\n\nSaved. Press back-button to keep working.\nOutput file: " Project["tccfg"]
  return str
}

function QuitPage(    str) {
  str = "\n\nScript has completed running.Output file: " Project["tccfg"]
  return str
}

# _________________________ core function _______________________________________________________

#
# Search Internet Archive and format results.
#
#  entity  : query string, already URL-encoded (it is appended to the request URL as-is)
#  pagenum : 1-based result page
#  sort    : "", "publicdate", "-publicdate", "date" or "-date"
#  name    : article name being reviewed
#
#  Returns the page body as a string, or an error sentence when nothing could be downloaded.
#
#  NOTE(review): several regular expressions and format strings in this function lost their
#    markup (empty // patterns, "(|)" separators, sprintf() formats with fewer %s than
#    arguments). They are kept as found and flagged; restore them from upstream.
#
function search_ia(entity, pagenum, sort, name    ,head,tail,url,xml,c,doc,page,arr,arrg,arrgh,numfound,numpages,myfloat,qin,ipage,zpage,G,i,visible,first,second,str,pageindex,pbon,pbtot,ad,ac,tempid) {

  if      (sort == "-publicdate") sort = "publicdate+desc"
  else if (sort == "publicdate")  sort = "publicdate+asc"
  else if (sort == "-date")       sort = "date+desc"
  else if (sort == "date")        sort = "date+asc"

  head = "http://archive.org/advancedsearch.php?q="
  tail = "&fl[]=date&fl[]=creator&fl[]=description&fl[]=downloads&fl[]=identifier&fl[]=mediatype&fl[]=subject&fl[]=title&sort[]=" sort "&sort[]=&sort[]=&rows=" Rows "&output=xml&callback=callback&save=yes&page=" pagenum
  url  = head entity tail

  # Download XML from Internet Archive
  xml = http2var(url)

  if (xml == "")
    return "Error in function search_ia: Unable to retrieve data from Internet Archive."

  if (length(xml) < 100) {                    # Bug in Internet Archive.
    print "Warning: XML bug. Using build_xml()" > "/dev/stderr"
    xml = build_xml(head, tail, entity, Rows, sort, pagenum)
  }

  # Decode the four XML entities. The recovered text had these patterns already decoded
  # (making each call a no-op); the replacement strings pin what the patterns were.
  gsub(/&lt;/, "<", xml)
  gsub(/&gt;/, ">", xml)
  gsub(/&quot;/, "\"", xml)
  gsub(/&amp;/, "\\&", xml)

  tempid = whatistempid( gensub("_", " ", "g", name), Project["index"])
  G["thumbnail"] = wikimainimage(name)

  # Show first paragraph and status button choices
  page = page "\n\n" name ""
  if (G["thumbnail"])
    page = page sprintf("\n\n\n", G["thumbnail"])       # NOTE(review): format lost its %s (image markup stripped)
  page = page wikilead(name)
  page = page "\n\nReject Accept Manual Quit\n\n"       # NOTE(review): the four status links were stripped

  # NOTE(review): separator looks stripped; presumably the per-result element delimiters - confirm.
  c = split(xml, doc, "(|)")

  # numFound="432"
  match(xml, "numFound=\"[0-9]+\"", arr)
  split(arr[0], arrg, "\"")
  numfound = arrg[2] + 0

  # Number of pages = ceiling(numfound / Rows), in integer arithmetic.
  numpages = int(numfound / Rows)
  if (numpages * Rows < numfound)
    numpages++

  # name="qin">text<   (search string as reported by IA)
  split(doc[1], arr, "(name=\"qin\">)")
  split(arr[2], arrg, "")                     # NOTE(review): terminator pattern stripped; as found
  qin = arrg[1]

  if (numfound > 0)
    first = (pagenum * Rows) - (Rows - 1)
  else
    first = 0

  if ((pagenum * Rows) > numfound)
    second = numfound
  else
    second = pagenum * Rows

  init_searchtokens(qin)                      # Parse search words for highlighting

  # page = page "\n\n\nSearch Results:\n\n"
  page = page "\n\n\n" CurrentID "\n\n"
  page = page "\n\n\nResults: " first " through " second " of " numfound " (" numpages " pages)\n"
  page = page "\nYou searched for: " qin "\n\n"

  # page index bar
  # NOTE(review): the link markup around Prev / page numbers / Next was stripped, which is why
  #   the "cgi" and "standalone" branches now look almost identical.
  if (numpages > 1) {

    pageindex = "\n\n\n"

    pbon  = pageblock(pagenum, numpages, "on")
    pbtot = pageblock(pagenum, numpages, "total")

    ipage = (int(pbon) - 1) * int(PBSize)     # last page of the previous block (0 for block 1)
    if (pbon == pbtot)
      zpage = numpages
    else
      zpage = ipage + int(PBSize)

    # "Prev"
    if (pbtot > 1 && pbon != 1) {
      if (RunType == "cgi")
        pageindex = pageindex "Prev"
      else if (RunType == "standalone")
        pageindex = pageindex " Prev"
      else
        pageindex = pageindex " error"
    }

    while (ipage++ < zpage) {
      if (ipage != pagenum) {
        if (RunType == "cgi")
          pageindex = pageindex "" ipage ""
        else if (RunType == "standalone")
          pageindex = pageindex "" ipage ""
        else
          pageindex = pageindex " error"
      }
      else
        pageindex = pageindex "[" ipage "]"

      if (ipage != numpages)
        pageindex = pageindex " "
    }

    # "Next"
    if (pbtot > 1 && pbon != pbtot) {
      if (RunType == "cgi")
        pageindex = pageindex "Next"
      else if (RunType == "standalone")
        pageindex = pageindex " Next"
      else
        pageindex = pageindex " error"
    }

    pageindex = pageindex "\n\n"
  }

  page = page pageindex

  # Results sit at the even indexes of doc[] (the text between the delimiters).
  i = visible = 0
  while (i < (Rows * 2)) {

    i = i + 2
    delete G

    # identifier (name="identifier">presidentgarfiel00hinsuoft<)
    match(doc[i], "name=\"identifier\">[^<]+<", arr)
    split(arr[0], arrg, "(<|>)")
    G["identifier"] = arrg[2]
    if (G["identifier"] == "")
      continue                                # Skip if no id
    visible++

    # creator and subject (multiple names are wrapped in an enclosing element with one child
    # element per name - sometimes multiple names are just plain text comma separated)
    # NOTE(review): every // below is a stripped pattern (as found, an empty regex matches any
    #   line, so the first branch always wins); likewise the "|" separator and the /<\/str>|/
    #   alternations lost their other half.
    ad = 0
    ac = split(doc[i], arr, "\n")             # possible point of failure if XML format ever stopped having a \n after each line
    while (ad++ < ac) {

      if (arr[ad] ~ //) {                     # creator, multi-valued
        gsub(//, "", arr[ad])
        gsub(/<[/]arr>/, "", arr[ad])
        split(strip(arr[ad]), arrg, "|")
        G["creatorfirst"] = hightokens(strip(arrg[2]))
        G["creator"] = strip(arr[ad])
        gsub(/<\/str>/, "; ", G["creator"])
        gsub(/<\/str>|/, "", G["creator"])
        G["creator"] = hightokens(G["creator"])
        continue
      }
      if (arr[ad] ~ //) {                     # creator, single value
        gsub(//, "", arr[ad])
        gsub(/<\/str>/, "", arr[ad])
        G["creator"] = hightokens(strip(arr[ad]))
        G["creatorfirst"] = G["creator"]
        continue
      }
      if (arr[ad] ~ //) {                     # subject, multi-valued
        gsub(//, "", arr[ad])
        gsub(/<\/arr>/, "", arr[ad])
        G["subject"] = strip(arr[ad])
        gsub(/<\/str>/, "; ", G["subject"])
        gsub(/<\/str>|/, "", G["subject"])
        G["subject"] = hightokens(G["subject"])
        continue
      }
      if (arr[ad] ~ //) {                     # subject, single value
        gsub(//, "", arr[ad])
        gsub(/<\/str>/, "", arr[ad])
        G["subject"] = hightokens(strip(arr[ad]))
        continue
      }
    }

    # description (name="description">text<)
    # NOTE(review): the separator and, it seems, a following split() into arrg[] were lost in
    #   extraction; as found, arrg[] still holds the identifier split from above.
    split(doc[i], arr, "(")
    G["description"] = hightokens(substr(striphtml(substr(arrg[1], 2)), 1, 320))   # Only show first 320 characters of description field

    # downloads (name="downloads">638<)
    match(doc[i], "name=\"downloads\">[0-9]+<", arr)
    split(arr[0], arrg, "(<|>)")
    G["downloads"] = arrg[2]

    # date (name="date">1881-01-01T00:00:00Z) - keep the year only
    match(doc[i], "name=\"date\">[^<]+<", arr)
    split(arr[0], arrg, "(<|>)")
    split(arrg[2], arrgh, "-")
    G["date"] = arrgh[1]

    # texts (name="mediatype">texts<)
    match(doc[i], "name=\"mediatype\">[^<]+<", arr)
    split(arr[0], arrg, "(<|>)")
    G["mediatype"] = arrg[2]

    # title (name="title">Title<)
    match(doc[i], "name=\"title\">[^<]+<", arr)
    split(arr[0], arrg, "(<|>)")
    G["title"] = hightokens(arrg[2])

    # cover image
    G["image"] = "//www.archive.org/download/" G["identifier"] "/page/cover_thumb.jpg"

    # display works
    page = page "\n\n"

    # media type icon (NOTE(review): only the alt text of each icon survived)
    if (G["mediatype"] == "texts")
      page = page sprintf("\n\n $\"texts\"$ ")
    else if (G["mediatype"] == "image")
      page = page sprintf("\n\n $\"image\"$ ")
    else if (G["mediatype"] == "video" || G["mediatype"] == "movies")
      page = page sprintf("\n\n $\"video\"$ ")
    else if (G["mediatype"] == "audio" || G["mediatype"] == "etree")
      page = page sprintf("\n\n $\"audio\"$ ")
    else if (G["mediatype"] == "software")
      page = page sprintf("\n\n $\"software\"$ ")
    else
      page = page sprintf("\n\n $\"collection\"$ ")

    # NOTE(review): the next two formats have fewer %s than arguments (markup stripped).
    page = page sprintf("\n\n\n", G["image"])
    page = page sprintf("\n%s - %s\n\n", G["identifier"], G["title"], G["creatorfirst"])
    page = page "\n\n"

    if (G["description"])
      page = page sprintf("\n%s\n", G["description"])
    if (G["date"])
      page = page sprintf("\nDate: %s\n", G["date"])
    if (G["creator"])
      page = page sprintf("\nCreator: %s\n", G["creator"])

    if (G["subject"]) {
      # NOTE(review): format has one %s for four arguments (link markup stripped).
      if (RunType == "cgi")
        page = page sprintf("\nKeywords: %s\n", URLPrefix, MyProg, urlencodeawk("subject:\"" G["subject"] "\""), G["subject"])
      else if (RunType == "standalone")       # was "=", which assigned RunType and was always true
        page = page sprintf("\nKeywords: %s\n", URLPrefix, MyProg, urlencodeawk("subject:\"" G["subject"] "\""), G["subject"])
      else
        page = page " error"
    }

    if (G["downloads"])
      page = page sprintf("\nDownloads: %s\n", G["downloads"])

    page = page "\n\n"

    # separator between works, not after the last slot
    if (i != (Rows * 2))
      page = page sprintf("\n\n\n\n")
  }

  if ( ! visible )
    page = page "\n\nNo results.\n\n"
  else
    page = page pageindex                     # repeat the index bar under the results

  return page
}

#
# Break total number of pages into blocks of PBSize (user defined) and return which block "pagenumber" belongs to.
#  Also an option (command = "total") to return total number of blocks.
#
#  command = "on"    : 1-based block that holds pagenumber (1 if none does)
#  command = "total" : number of blocks, never less than 1
#
function pageblock(pagenumber, numberofpages, command   ,myfloat, arr, numberofblocks, i, calc) {

  # ceiling(numberofpages / PBSize) in integer arithmetic
  numberofblocks = int(int(numberofpages) / int(PBSize))
  if (numberofblocks * int(PBSize) < int(numberofpages))
    numberofblocks++

  if (command == "total")
    return (numberofblocks > 1) ? numberofblocks : 1

  if (command == "on") {
    for (i = 1; i <= numberofblocks; i++) {
      if (int(pagenumber) <= i * int(PBSize))
        return i
    }
    return 1
  }
}

# _________________________ highlight search tokens _______________________________________________________

#
# highlight search tokens
#
#  Returns str with every whitespace-separated word that equals one of Tokens[] (compared
#  case-insensitively, after the punctuation listed in punct is removed) wrapped for display.
#  NOTE(review): the wrapper markup was stripped, so as found the word is emitted unchanged.
#
function hightokens(str    ,c,arr,i,t,out,build,work,punct) {

  # <-- Customize search token exceptions
  punct = "([,]|^[(]|[)]$|[)][,]$|\"|[.]$|\"[.]$|'[.]$|`[.]$|;|[:]|^'|'$|^`|`$|'s$|'s[.]$)"

  c = split(str, arr, " ")
  for (i = 1; i <= c; i++) {
    out  = ""
    work = arr[i]
    gsub(punct, "", work)
    for (t in Tokens) {
      gsub(punct, "", t)                      # cleans the loop copy only, Tokens[] is untouched
      if (tolower(t) == tolower(work)) {
        out = "" arr[i] ""
        break
      }
    }
    if (out == "")
      out = arr[i]
    build = (i == 1) ? out : build " " out
  }
  return build
}

#
# Fill the global Tokens[] with the words of the IA query that should be highlighted.
#  Quotes are removed and "+" is treated as a space; words matching (AND|OR|:) are skipped.
#
function init_searchtokens(iaquery   ,arr, arrg, i, k, c) {

  delete Tokens

  gsub("\"", "", iaquery)
  gsub(/[+]/, " ", iaquery)                   # literal plus sign
  split(iaquery, arr, "+")
  k = join(arr, 1, len2(arr), " ")
  c = split(k, arrg, " ")

  for (i = 1; i <= c; i++) {
    if (arrg[i] ~ "(AND|OR|:)")               # <-- Customize tokens to ignore eg. logic statements, etc..
      continue
    gsub(/[,]+$/, "", arrg[i])
    gsub(/[)]+$/, "", arrg[i])
    gsub(/^[(]+/, "", arrg[i])
    Tokens[arrg[i]] = arrg[i]
  }
}

#
# Build a passable XML file due to bug in Internet Archive (in some cases) triggered by unknown reasons.
#  1. create a fake XML header
#  2.
#     download list of identifier's in CSV format (this works for some reason but XML doesn't)
#  3. for each identifier, request *individual* XML (one for each work) and extract the portion between
#     the per-result delimiters
#  4. build a complete XML from the parts
#
#  head, tail, entity, rows, sort, pagenum : same values search_ia() used for the failed request
#  (tail is rebuilt here). Returns the assembled XML text.
#
#  NOTE(review): this file was recovered from a dump that stripped markup out of string
#    literals. The header/footer literals, the element wrappers and the "(|)" separator below
#    are kept exactly as found (mostly empty) and must be restored from upstream; as found, the
#    final numFound substitution has nothing to match.
#
function build_xml(head,tail,entity,rows,sort,pagenum,   csv,out,subxml,sc,doc,a,i,k,qin,url,c,ids) {

  qin = urlendecode(entity, "decode")

  # XML header
  out = "" "\n"
  out = out "" "\n"
  out = out "" "\n"
  out = out "0" "\n"
  out = out "89" "\n"
  out = out "" "\n"
  out = out "" "\n"
  out = out "xml" "\n"
  out = out "" rows "" "\n"
  out = out "" qin "" "\n"
  out = out "collection,creator,date,description,downloads,identifier,mediatype,publisher,title,year" "\n"
  out = out "0" "\n"
  out = out "" "\n"
  out = out "" "\n"
  out = out "" "\n"

  # Get CSV version (only content type that works)
  tail = "&fl[]=identifier&rows=" rows "&output=csv&callback=callback&save=yes&page=" pagenum
  url  = head entity tail
  csv  = http2var(url)

  # Per-identifier request. It asks for rows=1, so it must ask for page 1: requesting the
  # caller's page number would select past the single hit whenever pagenum > 1.
  tail = "&fl[]=date&fl[]=publisher&fl[]=collection&fl[]=creator&fl[]=description&fl[]=downloads&fl[]=identifier&fl[]=mediatype&fl[]=subject&fl[]=title&sort[]=" sort "&sort[]=&sort[]=&rows=1&output=xml&callback=callback&save=yes&page=1"

  ids = 0
  c = split(csv, a, "\n")
  for (i = 2; i <= c; i++) {                  # i = 1 is the CSV header
    gsub(/"/, "", a[i])                       # Remove quotes around CSV field
    gsub(/\r/, "", a[i])
    if (a[i] == "")                           # blank line, e.g. after a trailing newline
      continue
    ids++

    entity = "identifier%3A%22" a[i] "%22"
    url    = head entity tail
    subxml = http2var(url)

    # Decode the four XML entities (the replacement strings pin what the stripped patterns were).
    gsub(/&lt;/, "<", subxml)
    gsub(/&gt;/, ">", subxml)
    gsub(/&quot;/, "\"", subxml)
    gsub(/&amp;/, "\\&", subxml)

    sc = split(subxml, doc, "(|)")
    if (length(doc[2]) > 10) {
      out = out "" "\n"
      out = out doc[2]
      out = out "" "\n"
    }
  }

  out = out "" "\n"
  out = out "" "\n"

  # Patch the placeholder count in the fake header with the number of identifiers listed.
  gsub(/numFound="15"/, "numFound=\"" ids "\"", out)

  return out
}

# _________________________ utilities _______________________________________________________

#
# Populate GETARG["query"] and GETARG["name"] from Tccfg[]
#
#  Picks the first entry (numeric index order) whose status is "unknown", reads the file
#  <tempid><id>, and takes the first space-separated word of it, minus the
#  "[http(s)://archive.org/search.php?query=" prefix, as the query. Also sets CurrentID.
#  Returns 1 when an entry was loaded, 0 when none is left.
#
function loadnextGETARG(  a,b,o,tid,tmpof) {

  PROCINFO["sorted_in"] = "@ind_num_asc"

  for (o in Tccfg) {
    if (Tccfg[o]["status"] != "unknown")
      continue

    GETARG["name"] = Tccfg[o]["name"]
    tid   = whatistempid(GETARG["name"], Project["index"])
    tmpof = readfile(tid Tccfg[o]["id"])
    split(tmpof, a, " ")
    GETARG["query"] = gensub(/\[https?[:]\/\/archive[.]org\/search[.]php\?query\=/, "", "g", a[1])

    # id is "@"-separated; fields 2 and 3 are shown to the reviewer
    split(Tccfg[o]["id"], b, "@")
    CurrentID = "sname=" gensub(/_/, " ", "g", b[2]) " |sopt=" b[3]
    return 1
  }
  return 0
}

#
# Change the status field in Tccfg[]
#
#  name is matched case-insensitively against the "encode" field. On the first match the
#  status is stored, the cfg file is rewritten and, when the entry has a tempid, the choice is
#  also written to <tempid>tc.cfg.bak.
#
function changestatus(name, status,    o,re,out) {

  re = tolower("^" regesc(strip(name)) "$")   # same pattern for every entry

  for (o in Tccfg) {
    if (tolower(Tccfg[o]["encode"]) !~ re)
      continue

    if (Tccfg[o]["status"] ~ /unknown/)
      Count["processed"]++
    Tccfg[o]["status"] = status
    writetccfg()

    if (Tccfg[o]["tempid"]) {                 # Backup the choice (see also "Backup the file" below)
      out = Tccfg[o]["tempid"] "tc.cfg.bak"
      print status > out
      close(out)
    }
    return
  }
}

#
# Populate Tccfg[] from Project["tccfg"] ie. .tc.cfg generated by mkcfg
#
#  Each usable line has exactly four "|"-separated fields: name|id|status|encode.
#  Count["total"] is the number of such lines; Count["processed"] starts at 1 and is raised
#  once for every entry whose status is not "unknown".
#
function loadtccfg(  a,b,c,i,o) {

  delete Tccfg
  delete Count
  Count["processed"] = 1

  print "Reading tc.cfg and index files.."

  checkexists(Project["tccfg"], "iawow.awk loadtccfg()", "exit")
  c = split( readfile(Project["tccfg"]), a, "\n")
  for (i = 1; i <= c; i++) {
    if (split(a[i], b, "|") != 4)
      continue
    Tccfg[i]["name"]   = b[1]
    Tccfg[i]["id"]     = b[2]
    Tccfg[i]["status"] = b[3]
    Tccfg[i]["encode"] = b[4]
    Tccfg[i]["tempid"] = whatistempid( gensub("_", " ", "g", Tccfg[i]["name"]), Project["index"])
    Count["total"]++
  }

  print "OK"

  for (o in Tccfg) {
    if (Tccfg[o]["status"] !~ /^unknown$/)
      Count["processed"]++
  }
}

#
# Write Tccfg[] back out to the file
#
#  Removes Project["tccfg"], rewrites it from Tccfg[] in numeric index order and copies the
#  result to Project["tccfg"] ".bak". Exits if the old file is still present after the remove.
#
function writetccfg(   command,o) {

  close(Project["tccfg"])

  command = Exe["rm"] " " Project["tccfg"]
  sys2var(command)
  system("")                                  # flush
  if (checkexists(Project["tccfg"])) {
    print "writetccfg(): Unable to delete " Project["tccfg"]
    exit
  }

  PROCINFO["sorted_in"] = "@ind_num_asc"
  for (o in Tccfg)
    printf("%s|%s|%s|%s\n", Tccfg[o]["name"], Tccfg[o]["id"], Tccfg[o]["status"], Tccfg[o]["encode"]) >> Project["tccfg"]
  close(Project["tccfg"])

  # Backup the file (see also "Backup the choice" above)
  command = Exe["cp"] " " Project["tccfg"] " " Project["tccfg"] ".bak"
  sys2var(command)
  system("")
}

#
# Return the path/tempid of a name (eg. /home/adminuser/wi-awb/temp/wi-awb-0202173111/)
#
#  filepath is the index file, one "name|path" record per line. It is read once and cached
#  in the globals IX[] / IXC. Returns 0 when the name is not listed.
#
function whatistempid(name, filepath,      b,i,re) {

  if ( ! length(IX) ) {
    checkexists(filepath, "iawow.awk whatistempid()", "exit")
    IXC = split(readfile(filepath), IX, "\n")
  }

  re = "^" regesc(strip(name)) "$"
  # Remove ["]name["] which was stripped previously in ia9out-index
  gsub(/[[]["][]]/, "", re)

  for (i = 1; i <= IXC; i++) {
    split(IX[i], b, "|")
    if (strip(b[1]) ~ re)
      return strip(b[2])
  }
  return 0
}

#
# Return first paragraph of lead section of article.txt
#
#  Scans <tempid>article.txt up to the first line containing "==" and returns the first line
#  (wiki markup removed) that contains both the last word of name and a parenthetical.
#  Lines starting with | : { } or [ are skipped. Returns "" when no line qualifies.
#
function getfirstlines(name, tempid,    a,c,i,lastnamepage,s1,s2,firstlet,topoflead) {

  topoflead = "no"

  # last word of the name, ignoring a trailing "(...)" qualifier
  c = split(gensub(/[(][^\)]*[)]$/, "", "g", name), a, " ")
  lastnamepage = a[c]

  c = split(readfile(tempid "article.txt"), a, "\n")
  for (i = 1; i <= c; i++) {
    s1 = a[i]
    if (match(s1, "=="))                      # first section heading ends the lead
      break
    firstlet = substr(s1, 1, 1)
    if (match(firstlet, "([|]|[:]|[{]|[}]|[[])"))
      continue
    s2 = stripwikimarkup(s1)
    # Literal containment test: the name is data, not a regular expression. An empty name
    # keeps matching every line, as it did with match().
    if (lastnamepage == "" || index(s2, lastnamepage) > 0) {
      if (match(s2, "[(][^)]+[)]"))
        return s2
    }
  }
  return ""
}

#
# Length of an array. Portable function for older versions of gawk
#  Counts consecutive integer indexes starting at 1.
#
function len2(array,    i) {
  for (i = 1; i in array; i++)
    ;
  return i - 1
}

#
# strip HTML (method has general limitations but OK for this app)
#
function striphtml(s) {
  gsub(/<[^>][^>]*>/, "", s)
  return s
}