#!/usr/bin/env python
# (works in both Python 2 and Python 3)
# Online HTML Indexer v1.34 (c) 2013-18,2020,2022 Silas S. Brown.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# See comments in ohi.py for what this is about.
# Although the offline files will also work ONline, in
# bandwidth-limited situations you may be better off using
# this lookup CGI. This version can also take multiple
# adjacent anchors, giving alternate labels to the same
# fragment; there should not be any whitespace between
# adjacent anchors.
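# For example (illustrative markup, not from the original
# documentation), two anchors written back-to-back both label
# the fragment that follows:
#   <a name="colour"></a><a name="color"></a>colour/color: hue, tint ...
# so looking up either "colour" or "color" reaches the same entry.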
# Configuration
# -------------
# You can change these variables here, but if you do then
# it might be more difficult to upgrade to newer versions
# of this script. However, any file called ohi_config.py
# (in the current directory or 1 level up) will be read.
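# For example, a minimal ohi_config.py might read (a sketch;
# the values shown are illustrative assumptions, not defaults
# required by this script):
#   html_filename = "dictionary.html"
#   alphabet = "abcdefghijklmnopqrstuvwxyz0123456789"
#   lines_before, lines_after = 3, 6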
html_filename = "input.html" # set this to whatever
# - and when that file changes, this script will update
# files with that plus .index, .header and .footer
# (it might be a good idea to do a separate run of this
# script, from the command line, to perform that update,
# especially if you're on a slow machine and the webserver
# has a short timeout; note that the update does NOT have
# to be done on the same machine, as long as the resulting
# files can be copied across)
# (another speedup is to use a small wrapper script that
# imports ohi_online; the byte-compiled version of this
# module can then be reused after the first run)
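# For example, such a wrapper CGI could be just (a sketch,
# assuming ohi_online.py is importable from the wrapper's
# directory):
#   #!/usr/bin/env python
#   import ohi_online
#   ohi_online.main()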
alphabet = "abcdefghijklmnopqrstuvwxyz" # set to None to allow all characters and be case-sensitive; any headings not containing ANY of these characters will be put in as-is anyway
# ignore_text_in_parentheses NOT available in the online version because it could make it impossible to fetch entries that differ from others only in parenthetical additions (unless you merge the entries, which might not be a good idea)
more_sensible_punctuation_sort_order = True
remove_utf8_diacritics = True # if True, diacritics are removed in index headings (not in main text);
# assumes UTF-8 (letters with diacritics will be indexed as though they did not have any)
frontpage_lookup_prompt = "Lookup: "
shorter_lookup_prompt = "Lookup: "
lines_before = 5 ; lines_after = 10
max_show_more = 50 ; increment = 10
between_before_and_after = "<br>"
# For more compactness, try this instead:
# between_before_and_after = " | "
# (depends on what sort of data you have though)
# You can override these functions if you want:
def preprocess_result(markup): return markup
def links_to_related_services(query): return "" # e.g. "Here | <a href...>Somewhere else</a>"
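# For example, an override in ohi_config.py might read (a sketch;
# the external search URL is an illustrative assumption, and quote
# is made available to ohi_config by this script, see below):
#   def links_to_related_services(query):
#       return '<a href="https://example.org/search?q='+quote(query)+'">example.org</a>'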
code_to_run_when_DOM_changes = ""
# you can set this to any Javascript to run after our JS
# manages to change the DOM (on capable browsers), e.g. to
# fix some typography when browser support is detected
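# e.g. (an illustrative assumption that the page uses MathJax 3;
# re-typeset any mathematics in newly inserted markup):
# code_to_run_when_DOM_changes = "if(window.MathJax&&MathJax.typeset)MathJax.typeset();"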
web_adjuster_extension_mode = False
# If set to True, this module's handle() will work - see
# Web Adjuster 'extensions' option for more details.
# If set to False, we just behave as a CGI script.
web_adjuster_extension_url = "http://example.org/ohi.cgi"
web_adjuster_extension_url2 = "http://localhost/ohi.cgi"
cgi_name = "ohi.cgi" # for rewriting <a href="#..."> links
# Where to find history:
# on GitHub at https://github.com/ssb22/indexer
# and on GitLab at https://gitlab.com/ssb22/indexer
# and on BitBucket https://bitbucket.org/ssb22/indexer
# and at https://gitlab.developers.cam.ac.uk/ssb22/indexer
# and in China: https://gitee.com/ssb22/indexer
# ------------------------------------------
# allow overrides:
import sys ; sys.path = ['.','..'] + sys.path
try: import ohi_config
except ImportError: ohi_config = None
if not web_adjuster_extension_mode:
import cgitb ; cgitb.enable() # remove this if you don't want tracebacks in the browser
import mmap, os, cgi, re
try: from urllib import quote # Python 2
except ImportError: from urllib.parse import quote # Python 3
if ohi_config:
ohi_config.quote = quote # so functions there can use it
from ohi_config import *
try: xrange
except: xrange = range # Python 3
def B(s):
if type(s)==type(u""): return s.encode('utf-8')
else: return s
def create_linemap(fName):
f = open(fName,"rb")
lm = LineMap(f.fileno(), 0, access=mmap.ACCESS_READ)
lm.f = f # ensure not closed by gc
return lm
class LineMap(mmap.mmap): # might fail in old Python versions where mmap isn't a class
def linesAround(self,txt,linesBefore,linesAfter):
"returns (before,line,after), up to numLines lines either side of the line appropriate for txt"
self.seek(self.bisect(txt))
linesBefore = sum(self.back_line() for i in xrange(linesBefore))
return [self.readline() for i in xrange(linesBefore)],self.readline(),[x for x in [self.readline() for i in xrange(linesAfter)] if x]
def bisect(self,txt,lo=0,hi=-1):
"returns pos of start of appropriate line"
txt = B(txt)
if hi==-1: hi=len(self)
elif hi <= lo:
# return self.lineStart(hi)
# amendment: if only the first few characters matched, it's possible that the PREVIOUS entry will match more characters (positioning is rarely helped by an inserted character, e.g. a pinyin shen/sheng confusion, and we probably want to draw more attention to the previous entries in this case, especially if the following entries are completely different e.g. 'shi'; TODO: could even do full 'first entry that matches as many characters as possible' logic)
ret = self.lineStart(hi)
if ret==0 or self[ret:ret+len(txt)]==txt: return ret # all characters match current line, or there are no previous lines
txt2 = txt
while len(txt2)>1 and not self[ret:ret+len(txt2)]==txt2: txt2 = txt2[:-1] # delete characters from the end until all that are left match current line
ret2 = self.lineStart(ret-1)
if self[ret2:ret2+len(txt2)+1]==txt[:len(txt2)+1]: return ret2 # return previous line if they match that as well
else: return ret
lWidth,uWidth = int((hi-lo)/2),int((hi-lo+1)/2)
lMid = self.lineStart(lo+lWidth)
lLine = self.lineAt(lMid)
if lLine < txt: return self.bisect(txt,lMid+len(lLine),hi)
else: return self.bisect(txt,lo,lMid)
def lineStart(self,pos):
return self.rfind(B("\n"),0,pos)+1 # (for start of file, rfind will return -1 so this+1 is still what we want)
def lineAt(self,pos):
self.seek(pos) ; return self.readline()
def back_line(self):
p = self.tell()
if not p: return 0
elif self[p-1:p]==B('\n'):
self.seek(self.lineStart(p-1))
else: self.seek(self.lineStart(p))
return 1
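# Usage sketch (illustrative, with hypothetical filenames):
#   idx = create_linemap("input.html.index")
#   before, line, after = idx.linesAround("shen", 2, 3)
# gives up to 2 index lines before and 3 after the line that best
# matches "shen" (values are byte strings, one entry per line).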
if alphabet and more_sensible_punctuation_sort_order: alphaOnly = lambda x: re.sub('([;,]);+',r'\1',''.join(c for c in x.lower().replace('-',' ').replace(',','~COM~').replace(';',',').replace('~COM~',';').replace(' ',';') if c in alphabet+',;')) # gives ; < , == space (useful if ; is used to separate definitions and , is used before extra words to be added at the start; better set space EQUAL to comma, not higher, or will end up in wrong place if user inputs something forgetting the comma)
elif alphabet: alphaOnly = lambda x: ''.join(c for c in x.lower() if c in alphabet)
elif more_sensible_punctuation_sort_order: alphaOnly = lambda x: re.sub('([;,]);+',r'\1',x.replace('-',' ').replace(',','~COM~').replace(';',',').replace('~COM~',';').replace(' ',';'))
else: alphaOnly = lambda x:x
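# Worked example (illustrative, with the defaults above):
#   alphaOnly("Shen, to stretch; extend")
# lower-cases, swaps ';' and ',' (spaces become the swapped ';'),
# drops any other punctuation and collapses runs of separators,
# giving
#   "shen;to;stretch,extend"
# so an original ';' sorts before an original ',' (which sorts
# equal to a space), as described above.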
def ST(x):
if type(x)==type(""): return x # Python 2
return x.decode('utf-8') # Python 3
if more_sensible_punctuation_sort_order: undo_alphaOnly_swap = lambda x:ST(x).replace(';',' ').replace(',',';')
else: undo_alphaOnly_swap = lambda x:x
def U(s):
if type(s)==type(u""): return s
return s.decode('utf-8')
def S(s):
if type(u"")==type(""): return s # Python 3
else: return s.encode('utf-8') # Python 2
if remove_utf8_diacritics:
_ao = alphaOnly ; import unicodedata
alphaOnly = lambda x: _ao(S(u''.join((c for c in unicodedata.normalize('NFD',U(x)) if not unicodedata.category(c).startswith('M')))))
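# (quick illustrative check: with the above, alphaOnly(u"Caf\u00e9")
# reduces to "cafe", since the acute accent is stripped before the
# alphabet filter is applied)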
def load(fName):
txt = create_linemap(fName)
try:
if os.stat(fName).st_mtime <= os.stat(fName+".index").st_mtime:
return txt,create_linemap(fName+".index"),open(fName+".header").read(),open(fName+".footer").read()
except OSError: pass
ret = {}
contentStart = 0 ; header="" ; tag = ""
altTags = []
for m in re.finditer(B(r'<a name="([^"]*)"></a>'),txt):
# First, output the content from the PREVIOUS tag:
if contentStart and contentStart==m.start():
# oops, previous tag has NO content, so treat it as an 'alternate heading' to the tag we're about to have:
altTags.append(tag)
else:
for ttag in [tag]+altTags:
tag2 = alphaOnly(ttag)
if not tag2: tag2 = ttag
if contentStart:
if not tag2 in ret: ret[tag2] = (ttag,[])
ret[tag2][1].append("\t"+str(contentStart)+"\t"+str(m.start()))
else: # we're on the first tag
assert not altTags
header=txt[:m.start()]
if type(u"")==type(""): header=header.decode('utf-8') # Python 3
altTags = []
# Now look at the new tag:
tag = m.group(1) ; contentStart = m.end()
if type(u"")==type(""): tag=tag.decode('utf-8') # Python 3
footer = txt[contentStart:]
if type(u"")==type(""): footer=footer.decode('utf-8') # Python 3
if not header.strip(): header='<html><head><meta name="mobileoptimized" content="0"><meta name="viewport" content="width=device-width"><script>if(window.matchMedia && window.matchMedia("(prefers-color-scheme: dark)").matches)document.write("<style>body { background-color: black; color: #c0c000; } a { color: #00b000; }</style>");</script></head><body>'
if not footer.strip(): footer = '</body></html>'
try: ret = ret.iteritems() # Python 2
except: ret = ret.items() # Python 3
ret = [tag2+"\t"+ttag+"".join(rest)+"\n" for tag2,(ttag,rest) in ret] ; ret.sort()
open(fName+".index","w").write("".join(ret))
open(fName+".header","w").write(header)
open(fName+".footer","w").write(footer)
return txt,create_linemap(fName+".index"),header,footer
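# (Illustrative sketch of the .index format written above: one sorted
# line per heading, e.g.
#   shen<TAB>Shen<TAB>1234<TAB>1789
# i.e. the sort key, the original heading, then one or more pairs of
# start/end byte offsets into the main HTML file.)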
if web_adjuster_extension_mode: cginame = web_adjuster_extension_url[web_adjuster_extension_url.rindex('/')+1:]
else:
cginame = os.sep+sys.argv[0] ; cginame=cginame[cginame.rindex(os.sep)+1:]
def queryForm(prompt): return "<form action=\""+cginame+"\">"+prompt+'<input type="text" name="q"><input type="Submit" value="OK"></form>'
def out(html="",req=None):
if not html: html='<script><!--\ndocument.forms[0].q.focus();\n//--></script>' # TODO: else which browsers need <br> after the </form> in the line below?
html = queryForm(shorter_lookup_prompt)+html
if req:
req.set_header('Content-type','text/html; charset=utf-8')
req.write(B(header+html+footer))
else: print ("Content-type: text/html; charset=utf-8\n\n"+header+html+footer)
def link(l,highl=""):
l,linkText,rest = U(l).split('\t',2) ; highl = U(highl)
mismatch = u""
while highl and not l.startswith(highl): highl,mismatch=highl[:-1],highl[-1]+mismatch
i = j = 0
for c in highl:
matched = (linkText[i]==c)
if matched or (alphabet and not linkText[i] in alphabet and not linkText[i] in l):
i += 1
if matched: j = i
else: break
if j:
matchedPart,nextPart = linkText[:j],linkText[j:]
if nextPart and not nextPart.startswith(" ") and mismatch and not mismatch.startswith(" "): # show a red border around the mismatched letter to reinforce what happened (but ensure it's a border, not font colour, because we don't know what the user's background colour is)
nextPart="<span style=\"border: thin red solid\">"+nextPart[0]+"</span>"+nextPart[1:]
linkText = '<b>'+matchedPart+'</b>'+nextPart
l,linkText=S(l),S(linkText)
return '<a href="'+cginame+'?q='+quote(undo_alphaOnly_swap(l))+'&e=1" onclick="return tryInline(this)">'+linkText+'</a>' # (this gives a 'click to expand/collapse' option on browsers that support it, TODO: configurable? option to have onMouseOver previews somewhere?? careful as could run into trouble with user CSS files)
# (Could shorten l to the shortest unique part of the word, but that might not be a good idea if the data can change while users are online)
def redir(base,rest,req=None):
if not base:
if web_adjuster_extension_mode: base = web_adjuster_extension_url
else: base=os.environ.get("SCRIPT_URI",cginame) # cginame would make it a relative redirect, which might or might not work with the browser/server
if req:
req.set_status(302)
req.set_header("Location",base+rest)
return
print ("Status: 302") # TODO: check this works on all servers
print ("Location: "+base+rest)
print ("")
def linkSub(txt): return re.sub(r'(?i)<a href=("?)#',r'<a href=\1'+cgi_name+'?e=1&q=',ST(txt))
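# e.g. (illustrative): linkSub('<a href="#shen">') gives
# '<a href="ohi.cgi?e=1&q=shen">' with the default cgi_name, so
# in-document fragment links become lookups through this CGI.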
def main(req=None):
if req: query = req.request.arguments
elif web_adjuster_extension_mode:
load(html_filename)
sys.stderr.write("Index is now up-to-date\n")
return
else: query = cgi.parse()
def qGet(k,default=""):
v = query.get(k,default)
if type(v)==list: v=v[0]
if type(v)==str: v=v.strip() # TODO: or just .lstrip() ? (accidental spaces entered on mobile devices)
return v
q = qGet("q")
a = int(qGet("a",lines_after))
b = int(qGet("b",lines_before))
e = qGet("e")
if q and not e and a==lines_after and b==lines_before and not query.get("t",""): return redir("","?q="+quote(undo_alphaOnly_swap(q))+"&t=1#e",req=req)
global header,footer
txt,index,header,footer = load(html_filename)
if not q: return out(req=req)
q,q0 = alphaOnly(q),q
if not q: q = q0
if e:
ranges = ST(index.linesAround(q,0,0)[1]).split("\t")[2:]
toOut = preprocess_result("<hr>".join(linkSub(txt[int(a):int(b)]) for a,b in zip(ranges[::2], ranges[1::2])))
if e=="2":
if req:
req.set_header('Content-type','text/plain; charset=utf-8')
req.write(toOut)
else: print ("Content-type: text/plain; charset=utf-8\n\n"+toOut) # for the XMLHttpRequest
return
else: return out(toOut,req=req)
b4,line,aftr = index.linesAround(q,b,a)
lnks = links_to_related_services(q0)
if lnks: lnks += '<hr>'
def more(a,b,tag,label): return ('<a name="%s" href="%s?q=%s&a=%d&b=%d#%s">%s</a>' % (tag,cginame,quote(undo_alphaOnly_swap(q)),a,b,tag,label)) # 'after' version of this works only if it's at the very bottom of the page, so the words above it are still on-screen when jumping to its hash
if b < max_show_more and len(b4)==b: moreBefore = more(a,min(b+increment,max_show_more),"b","&lt;&lt; more")+between_before_and_after
else: moreBefore = '<a name="b"></a>'
if a < max_show_more and len(aftr)==a: moreAfter = between_before_and_after+more(min(a+increment,max_show_more),b,"a","more &gt;&gt;")
else: moreAfter = '<a name="a"></a>'
if not '<' in between_before_and_after: tableStyle,tableAround = ' style="display:inline-table"',between_before_and_after
else: tableStyle,tableAround = "",""
out(lnks+moreBefore+"""<script><!--
function tryInline(l) { l.onclick=function(){return false}; if(!(XMLHttpRequest&&l.innerHTML)) return true; var n=document.createElement("div"); l.parentNode.insertBefore(n,l.nextSibling); n.innerHTML="Loading"; if(n.innerHTML!="Loading") return true; n.setAttribute("style","border:thin blue solid"); function g(h){l.myStuff=h;n.innerHTML=h;if(l.parentNode.nodeName=='TD') l.parentNode.parentNode.parentNode.parentNode.style.display='block';l.onclick=function(){l.parentNode.removeChild(n);if(l.parentNode.nodeName=='TD') l.parentNode.parentNode.parentNode.parentNode.style.display='inline-table';l.onclick=function(){return tryInline(l)};return false};"""+code_to_run_when_DOM_changes+"""}; if(l.myStuff) g(l.myStuff);else{var req=new XMLHttpRequest();req.open("GET",l.href.replace("&e=1","&e=2"),true);req.onreadystatechange=function(){if(req.readyState==4)g(req.responseText)};req.send()}return false }
//--></script>"""+between_before_and_after.join(link(l) for l in b4)+tableAround+'<table border'+tableStyle+'><tbody><tr><td><a id="e" name="e"></a>'+link(line,q)+'</td></tr></tbody></table>'+tableAround+between_before_and_after.join(link(l) for l in aftr)+moreAfter,req=req)
def handle(url,req):
global web_adjuster_extension_url,web_adjuster_extension_url2
if url.startswith(web_adjuster_extension_url):
main(req)
return True
elif url.startswith(web_adjuster_extension_url2):
web_adjuster_extension_url,web_adjuster_extension_url2 = web_adjuster_extension_url2,web_adjuster_extension_url
try: main(req)
finally: web_adjuster_extension_url,web_adjuster_extension_url2 = web_adjuster_extension_url2,web_adjuster_extension_url
return True
if __name__=="__main__": main()