#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Multiple Database: Sample Whoosh Search.

CGI script: parse a query with Whoosh, optionally restrict it to one
collection and/or one period, and print the ranked hits as HTML followed by
author and title frequency tables.

NOTE(review): the copy of this script that was reviewed had been flattened
onto a few lines and had lost every ``<...>`` span. That removed the HTML
tags inside its string literals and the code between the first
``baseurl["marat"] = '`` assignment and the "Search Limited to Period"
message. Everything marked ``TODO(restore)`` below is a stand-in for lost
text and must be checked against the deployed original:

* the request-parameter names and defaults in ``read_request``;
* the contents of ``philodbtype``;
* the HTML markup of every printed string, including whatever was printed
  between the page header and the "Using Index" line;
* the "Search Limited to Collection" message;
* the loops that print the two frequency tables.

The per-hit link building (``baseurl``, ``philosearchbaseurl``,
``numphiloid`` and the philoid rewriting) is not carried over: its results
were only used in commented-out lines (the page itself says "Links turned
off at this time.") and the URL tables it needs were among the lost code.
"""

# NOTE: ``cgi`` and ``cgitb`` are deprecated since Python 3.11 and removed in
# Python 3.13; this script needs an interpreter that still ships them.
import cgi
import cgitb
import codecs
import html
import re
import sys
from collections import Counter
from decimal import Decimal

# Kept ahead of the third-party imports, as in the original, so that an
# ImportError raised by them is reported by cgitb as well.
cgitb.enable()  # for troubleshooting

import unidecode
from whoosh import qparser
from whoosh.index import open_dir
from whoosh.qparser import *
from whoosh.query import *

whooshindex2use = "/var/www/html/mark/hub/whoosh7dbindexallrev3/"

# Query used when the request carries no search words.
DEFAULT_SEARCH_WORDS = "conspirateurs aristocrates ennemis etrangeres royalistes"

# TODO(restore): the original value of ``reslimit`` was lost; 100 is assumed.
DEFAULT_RESULT_LIMIT = 100

# Number of rows shown in each frequency table ("Top 20 ..." in the page text).
FREQUENCY_ROWS = 20

# TODO(restore): markup reconstructed around the one surviving piece of text.
header = """<html>
<head>
<meta charset="utf-8">
<title>Multiple Database: Sample Whoosh Search</title>
</head>
<body>
"""

# Maps a database name (the ``philodbname`` stored field) to its type; hits
# from databases of type "div" are shown with their division head and date.
# TODO(restore): the original entries were lost; until they are put back no
# database is treated as "div".
philodbtype = {}


def read_request():
    """Return the search parameters supplied with the CGI request.

    TODO(restore): the original parsing code was lost. The form-field names
    used here simply mirror the variable names the rest of the script reads,
    and the defaults are assumptions; confirm both against the search form.

    Returns:
        dict with keys ``searchwords``, ``collectionlimit``, ``periodlimit``
        (strings, empty when absent), ``opbind`` and ``showsnippets``
        (booleans) and ``reslimit`` (int).
    """
    form = cgi.FieldStorage()
    try:
        reslimit = int(form.getfirst("reslimit", DEFAULT_RESULT_LIMIT))
    except ValueError:
        reslimit = DEFAULT_RESULT_LIMIT
    return {
        "searchwords": form.getfirst("searchwords", ""),
        "collectionlimit": form.getfirst("collectionlimit", ""),
        "periodlimit": form.getfirst("periodlimit", ""),
        "opbind": bool(form.getfirst("opbind", "")),
        "showsnippets": bool(form.getfirst("showsnippets", "")),
        "reslimit": reslimit,
    }


def clean_query(searchwords):
    """Return *searchwords* reduced to word characters and spaces.

    This also strips query-syntax characters such as ``:``, ``>`` and ``=``
    from user input before it reaches the query parser.
    """
    return re.sub(r"[^\w ]", "", searchwords)


def build_filter(collectionlimit, periodlimit):
    """Return the Whoosh filter for the requested limits, or ``None``.

    The collection name is lower-cased before it is matched against the
    ``philodbname`` field; the period is matched as given.
    """
    terms = []
    if collectionlimit:
        terms.append(Term("philodbname", collectionlimit.lower()))
    if periodlimit:
        terms.append(Term("period", periodlimit))
    return And(terms) if terms else None


def author_fields(hit):
    """Return ``(display_prefix, frequency_key)`` for the author of *hit*.

    Hits from the "baudouin" database are labelled "[REVLAW]" and counted as
    "RevLaw" without reading their ``author`` field. Other hits use the
    stored author; an empty author yields an empty prefix and an empty key
    (which the caller does not count).
    """
    if hit["philodbname"] == "baudouin":
        return "[REVLAW], ", "RevLaw"
    author = hit["author"]
    if author:
        # Count by the bare name; the ", " belongs to the display form only.
        return author + ", ", author
    return "", ""


def doc_ident(filename):
    """Return the basename of *filename* without a trailing ``.xml``."""
    basename = filename.split("/")[-1]
    return re.sub(r"\.xml$", "", basename)


def format_hit(position, hit, score, author_prefix, is_div):
    """Return the HTML line describing one hit.

    Args:
        position: 1-based rank shown in front of the hit.
        hit: mapping of stored fields; ``title``, ``date`` and ``filename``
            are required, ``divhead`` and ``divdate`` are optional.
        score: relevance score, shown rounded to two decimals.
        author_prefix: display form of the author, see ``author_fields``.
        is_div: when true, the division head and date precede the author.

    TODO(restore): the tags used here replace markup that was lost.
    """
    parts = ["<p>" + str(position) + ": "]
    if is_div:
        try:
            head = "<b>" + hit["divhead"] + "</b> "
        except KeyError:
            head = "[NO DIV TITLE] "
        try:
            divdate = hit["divdate"]
        except KeyError:
            divdate = "NA"
        parts.append(head)
        parts.append("[" + divdate + "] " if divdate else "[NA] ")
    parts.append(author_prefix)
    parts.append("<i>" + hit["title"] + "</i>  ")
    parts.append("[" + hit["date"] + "] ")
    parts.append("Score: " + str(round(score, 2)) + " ")
    parts.append(" DocID: " + doc_ident(hit["filename"]))
    return "".join(parts)


def print_frequencies(counts):
    """Print the most frequent entries of *counts*, one ``name : count`` row each.

    TODO(restore): the original loop was lost; descending order and the
    cut-off are inferred from the "Top 20" wording of the page.
    """
    for elem in counts.most_common(FREQUENCY_ROWS):
        print("<br>", elem[0], " :", elem[1])


def main():
    """Run the search described by the CGI request and print the HTML page."""
    print("Content-type: text/html;charset=utf-8\n\n")
    print(header)
    print("Using Index: " + whooshindex2use + "<br>\n")

    request = read_request()
    searchwords = request["searchwords"]
    collectionlimit = request["collectionlimit"]
    periodlimit = request["periodlimit"]
    reslimit = request["reslimit"]

    # Request values are escaped before being echoed into the page.
    if collectionlimit:
        # TODO(restore): message reconstructed by analogy with the period one.
        print("Search Limited to Collection: " + html.escape(collectionlimit) + "<br>\n")
    if periodlimit:
        print("Search Limited to Period: " + html.escape(periodlimit) + "<br>\n")

    if searchwords:
        searchwordsforwhoosh = clean_query(searchwords)
        print(f"Input Terms: {html.escape(searchwords)}")
    else:
        print("Input some words!<p>")
        searchwordsforwhoosh = DEFAULT_SEARCH_WORDS
        print("Here are default search words: " + searchwordsforwhoosh + "<br>")

    search_index = open_dir(whooshindex2use)
    parser_options = {}
    if request["opbind"]:
        # OR the terms together, still favouring documents that match more of them.
        parser_options["group"] = qparser.OrGroup.factory(0.9)
    query_parser = QueryParser("content", search_index.schema, **parser_options)
    query_parser.add_plugin(GtLtPlugin())

    authorcount = Counter()
    titlecount = Counter()

    with search_index.searcher() as searcher:
        parsed_query = query_parser.parse(searchwordsforwhoosh)
        print(f" => PARSED QUERY: {html.escape(str(parsed_query))}<br>")

        allow_q = build_filter(collectionlimit, periodlimit)
        results = searcher.search(parsed_query, filter=allow_q, limit=reslimit, terms=True)
        # Let the highlighter look at the whole text, not just its beginning.
        results.fragmenter.charlimit = None

        mytres = len(results)
        if allow_q is not None:
            # As in the original, filtered-out documents are subtracted only
            # when a filter was applied.
            mytres -= results.filtered_count
        print("<br>Your query returned ", mytres, "results. ")
        if mytres > reslimit:
            print("<br>Displaying top ", reslimit, "results.")
        print(" (Top 20 Author and Title Frequencies displayed at bottom)")
        print("<br>Links turned off at this time.")
        print("<p><hr>")

        for position, hit in enumerate(results, start=1):
            author_prefix, author_key = author_fields(hit)
            titlecount[hit["title"]] += 1
            if author_key:
                authorcount[author_key] += 1

            is_div = philodbtype.get(hit["philodbname"]) == "div"
            outline = format_hit(position, hit, hit.score, author_prefix, is_div)

            if request["showsnippets"]:
                print(outline + "<br>\n")
                fragments = hit.highlights("content", top=5).split("...")
                print("<br>".join(f"... {t}..." for t in fragments))
                print("<p>\n")
            else:
                print(outline + "\n")

    print("<hr>\n<h3>Top 20 authors</h3>")
    print_frequencies(authorcount)
    print("<h3>Title frequencies</h3>")
    print_frequencies(titlecount)
    print("</body>\n</html>")


if __name__ == "__main__":
    main()