#!/usr/local/bin/gawk -bE

#
# awsexp.awk searches for AWS links to make sure they have a web archive, before they expire (if not already expired)
#
# https://en.wikipedia.org/w/index.php?search=insource%3A%2F%5C%26Expires%3D%5B0-9%5D%7B10%7D%5C%26%2F&title=Special%3ASearch&profile=advanced&fulltext=1&ns0=1
#
# Expires=[0-9]{10} is a unix timestamp
#

#
# The MIT License (MIT)
#
# Copyright (c) May 2021 User:GreenC at en.wikipedia.org
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#

# library.awk is avail from https://github.com/greencardamom/BotWikiAwk/tree/master/lib
@include "library.awk"

#
# datefourteen() - return current date and time: 20170610T010101
#
#  Requirement: Exe["date"]
#
function datefourteen() {
    if( ! checkexe(Exe["date"], "date") )
        return
    return sys2var(Exe["date"] " +\"%Y%m%dT%H%M%S\"")
}

#
# Add a new entry to list
#
function addList(page,  mainPage,field,sep,fp,c,i) {

    fp = sys2var(Exe["wikiget"] " -w " shquote(page))
    c = patsplit(fp, field, /https?:\/\/[^ }|<\]\n]+[^ }|<\]\n]/, sep)
    for(i = 1; i <= c; i++) {
        if(field[i] ~ /&Expires=[0-9]{10}[&]/) {
            L[page]["page"] = page
            if(empty(L[page]["urls"]))
                L[page]["urls"] = strip(field[i])
            else
                L[page]["urls"] = L[page]["urls"] " " strip(field[i])
        }
    }
    if(c == 0 || empty(L[page]["urls"])) {
        print(dateeight() " ---- Unable to find URL(s) ---- " page) >> Logerror
        return 0
    }
    return 1
}

#
# Check if page exists in list.txt
#
function inList(page,  k) {
    for(k in L) {
        if(L[k]["page"] == page)
            return 1
    }
    return 0
}

#
# Check if page + url exists in Logfile
#
function inSPN(page, url,  k) {
    for(k in LS) {
        if(k == page " " url)
            return 1
    }
    return 0
}

#
# Load from list.txt to L[][]
#
function loadList(file,  a,b,page,urls,i) {

    if(!checkexists(file))
        return
    for(i = 1; i <= splitn(file, a, i); i++) {
        if(split(a[i], b, " ---- ") == 2) {
            page = strip(b[1])
            urls = strip(b[2])
            if(!empty(page) && !empty(urls)) {
                L[page]["page"] = page
                L[page]["urls"] = urls
            }
        }
    }
}

#
# Load from Logfile to LS[][]
#
function loadSPN(file,  a,b,page,url,i) {

    if(!checkexists(file))
        return
    for(i = 1; i <= splitn(file, a, i); i++) {
        if(split(a[i], b, " ---- ") >= 3) {   # legacy entries have 3 fields, entries written by savepagenow() have 6 - url is field 3 in both
            page = strip(b[1])
            url = strip(b[3])
            url = subs("/usr/bin/timeout 5m /usr/bin/wget -q -O- 'https://web.archive.org/save/", "", url)  # strip legacy wget-command prefix if present
            sub(/'$/, "", url)
            url = strip(url)
            if(!empty(page) && !empty(url)) {
                LS[page " " url]["page"] = page
                LS[page " " url]["url"] = url
            }
        }
    }
}

#
# Save L[][] to list.txt
#
function saveList(file,  page) {

    if(checkexists(file))
        sys2var(Exe["rm"] " -r " file)
    for(page in L)
        print L[page]["page"] " ---- " L[page]["urls"] >> file
}
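
#
# expiresToDate() - illustrative sketch, not called by the bot: convert the
#  10-digit unix timestamp from an &Expires= parameter into the same
#  fourteen-digit form returned by datefourteen(). Assumes GNU date, which
#  accepts "-d @<epoch>". eg. expiresToDate("1620000000") gives
#  "20210503T000000" when the system timezone is UTC.
#
function expiresToDate(ts) {
    if(ts !~ /^[0-9]{10}$/)
        return ""
    return sys2var(Exe["date"] " -d @" ts " +\"%Y%m%dT%H%M%S\"")
}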
#
# Issue a SavePageNow at archive.today (and/or Wayback) if URL is new and not already expired.
#  Add entry to Logfile.
#
function savepagenow(page,  command,a,i,c,url,d,newurl,res,now,expires,savemsg) {

    c = split(L[page]["urls"], a, " ")
    for(i = 1; i <= c; i++) {
        url = strip(a[i])
        if(! inSPN(page, url)) {
            if(match(url, /Expires=[^&]+[^&]/, expires) > 0) {
                sub(/Expires=/, "", expires[0])         # Expires unix-time
                now = sys2var(Exe["date"] " +\"%s\"")   # now in unix-time
                if(int(now) < int(expires[0])) {

                    # captures don't work at Wayback or Archive.today?

                    # Wayback - might work with PDFs, also could be slow
                    command = Exe["timeout"] " 5m " Exe["wget"] " -q -O- " shquote("https://web.archive.org/save/" url)
                    sys2var(command)
                    savemsg = "[saved at wayback] "

                    # Archive.today
                    # command = Exe["timeout"] " 5m " Home "archivetoday.bsh " shquote(url)
                    # res = sys2var(command)
                    # newurl = res
                    # if(match(res, /document[.]location[.]replace[(]["]https:\/\/archive[.]is\/wip\/[^"]+["]/, d) > 0) {
                    #   gsub(/(document[.]location[.]replace[(]["]|["]$|[/]wip)/, "", d[0])
                    #   newurl = d[0]
                    # }

                    print(page " ---- " datefourteen() " ---- " url " ---- " expires[0] " ---- " now " ---- unexpired") >> Logfile
                    # sys2var(Exe["mailx"] " -s " shquote("NOTIFY: " BotName "(" Hostname "." Domain ") Found [unexpired] " savemsg "Expires= URL in: " page) " " P["email"] " < /dev/null")
                    email(Exe["from_email"], Exe["to_email"], "NOTIFY: " BotName "(" Hostname "." Domain ") Found [unexpired] " savemsg "Expires= URL in: " page, "")
                }
                else {
                    print(page " ---- " datefourteen() " ---- " url " ---- " expires[0] " ---- " now " ---- expired") >> Logfile
                    # sys2var(Exe["mailx"] " -s " shquote("NOTIFY: " BotName "(" Hostname "." Domain ") Found [expired] Expires= URL in: " page) " " P["email"] " < /dev/null")
                    email(Exe["from_email"], Exe["to_email"], "NOTIFY: " BotName "(" Hostname "." Domain ") Found [expired] Expires= URL in: " page, "")
                }
            }
        }
    }
}

function main(  search,a,i,watch,b,x) {

    loadList(Listtxt)   # ..into L[][]
    loadSPN(Logfile)    # ..into LS[][]

    # Search mainspace, File: and Draft:
    # Note: insource search sometimes "flaps" ie. for a given URL it sometimes reports found, other times not. Thus the need
    #  to check the Logfile if it was previously saved.
    search = sys2var(Exe["wikiget"] " -a \"insource:" shquote("&Expires") " insource:/\\&Expires=[0-9]{10}\\&/\"")

    if(empty(search)) {
        stdErr("Empty search result - aborted awsexp.awk")
        exit
    }

    if(Debug)
        stdErr(datefourteen() " ---- number of pages in search ---- " split(search, a, "\n"))

    # Add to list.txt if not already in list.txt
    for(i = 1; i <= splitn(search "\n", a, i); i++) {
        a[i] = strip(a[i])
        if(!empty(a[i])) {
            if(inList(a[i]) == 0) {   # if not in list.txt
                addList(a[i])
                savepagenow(a[i])
                print(a[i] " ---- " datefourteen() " ---- add") >> Logsyslog
                x++
            }
        }
    }

    # Remove from list.txt if not in search
    for(b in L) {
        watch = 0
        for(i = 1; i <= length(a); i++) {
            if(L[b]["page"] == a[i])
                watch = 1
        }
        if(!empty(b) && watch == 0) {
            print(L[b]["page"] " ---- " datefourteen() " ---- remove") >> Logsyslog
            delete L[b]
            x++
        }
    }

    if(int(x) > 0) {
        saveList(Listtxt)
        # Why push?
        sys2var(Exe["push"] " awsexp")
    }
}
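
#
# isExpired() - illustrative sketch, not called by the bot: mirrors the
#  Expires comparison done inline by savepagenow(). Returns 1 if the URL's
#  &Expires= timestamp is in the past, 0 if still live, -1 if no 10-digit
#  timestamp is found. Uses gawk's match() capture-group array.
#
function isExpired(url,  m,now) {
    if(match(url, /Expires=([0-9]{10})/, m) == 0)
        return -1
    now = sys2var(Exe["date"] " +\"%s\"")   # now in unix-time
    if(int(now) < int(m[1]))
        return 0    # still live
    return 1        # already expired
}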
BEGIN {

    BotName = "awsexp"
    Hostname = "en"
    Domain = "wikipedia.org"

    IGNORECASE = 1

    Home = "/home/greenc/toolforge/awsexp/"

    # wikiget avail at https://github.com/greencardamom/Wikiget
    Exe["wikiget"] = "/home/greenc/scripts/wikiget"
    Exe["timeout"] = "/usr/bin/timeout"
    Exe["wget"] = "/usr/bin/wget"
    Exe["date"] = "/usr/bin/date"
    Exe["rm"] = "/bin/rm"
    Exe["chmod"] = "/bin/chmod"
    Exe["mailx"] = "/usr/bin/mailx"
    Exe["grep"] = "/bin/grep"
    Exe["cat"] = "/bin/cat"
    Exe["push"] = "/home/greenc/toolforge/scripts/push"

    # Cfg for email() in library.awk -- more documentation there
    # This is for SMTP via curl - optionally you can use mailx - uncomment lines in this file
    Exe["curl"] = "/usr/bin/curl"

    Debug = 0

    Listtxt = Home "www/list.txt"
    Logfile = Home "www/logsavepage.txt"
    Logerror = Home "www/logerrors.txt"
    Logsyslog = Home "www/logsyslog.txt"

    main()
}
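
#
# Usage sketch (an assumption - the schedule is not documented in this file):
#  the bot is designed to run unattended, eg. from cron on Toolforge, with the
#  path taken from Home above:
#
#    0 * * * * /home/greenc/toolforge/awsexp/awsexp.awk
#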