# Search entire Wikipedia database
#
# Download Wikipedia (English): https://en.wikipedia.org/wiki/Wikipedia:Database_download#English-language_Wikipedia
# Install mawk. Binary for Cygwin: http://artfiles.org/cygwin.org/pub/cygwinports/x86/release/mawk/
#
# Prints the title of every article whose body contains a given string.
#
# Optional overrides (the defaults reproduce the original hard-coded values):
#   -v dump=/path/to/pages-articles.xml   XML dump to scan
#   -v needle=some.literal.text           literal (non-regex) string to look for
#
# NOTE(review): the copy of this script under review was collapsed onto one
# line and its tag/entity patterns were blank (RS=(""), match(rawstr, ".+"),
# split(a, b, "(|)"), gsub(/</,"<",...)).  The patterns below (</page>,
# <title>, <text ...>, &lt; &gt; &quot; &amp;) are reconstructed from the
# "Article title" / "Article body" comments and the pages-articles dump
# format -- confirm against the original script and the dump schema.

# Return the text between the first match of open_re and the next occurrence
# of the literal close_tag in s, or "" when either delimiter is missing.
# Names after the extra spaces in the parameter list are locals.
function between(s, open_re, close_tag,    rest, stop) {
    if (!match(s, open_re))
        return ""
    rest = substr(s, RSTART + RLENGTH)
    stop = index(rest, close_tag)
    if (stop == 0)
        return ""
    return substr(rest, 1, stop - 1)
}

# Decode the four entities the original script handled.
# &amp; is decoded last so that "&amp;lt;" becomes "&lt;" rather than "<".
function decode(s) {
    gsub(/&lt;/, "<", s)
    gsub(/&gt;/, ">", s)
    gsub(/&quot;/, "\"", s)
    gsub(/&amp;/, "\\&", s)   # "\\&" is a literal ampersand in a gsub replacement
    return s
}

BEGIN {
    if (dump == "")
        dump = "/f/t/wikipedia-dump/enwiki-20150515-pages-articles.xml"
    if (needle == "")
        needle = "sportsillustrated.cnn.com"

    # One record per page: split the input on the closing page tag.
    RS = "</page>"

    while ((status = (getline rawstr < dump)) > 0) {

        # Extract from the raw record first and decode afterwards, so that
        # escaped markup inside an article is never mistaken for a real tag.
        # Both values are recomputed for every record; a page without a body
        # therefore cannot inherit the previous page's text.

        # Article title
        title = decode(between(rawstr, "<title>", "</title>"))

        # Article body
        document = decode(between(rawstr, "<text[^>]*>", "</text>"))

        # ---------- Find all articles whose body contains the search string -----
        # index() is a literal substring search; match() would treat each "."
        # in the search string as a regex wildcard.
        if (document != "" && index(document, needle) > 0) {
            print title
            #system("") # flush buffer
        }
    }

    if (status < 0) {
        print "error: cannot read " dump > "/dev/stderr"
        exit 1
    }

    close(dump)
}