# mirror of https://github.com/ssb22/indexer.git
# synced 2023-06-14 16:48:35 +00:00
#!/usr/bin/env python
# (works in both Python 2 and Python 3)

# Online HTML Indexer v1.34 (c) 2013-18,2020,2022 Silas S. Brown.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# See comments in ohi.py for what this is about.
# Although the offline files will also work ONline, in
# bandwidth-limited situations you might be better using
# this lookup CGI. This version can also take multiple
# adjacent anchors, giving alternate labels to the same
# fragment; there should not be any whitespace between
# adjacent anchors.

# Configuration
# -------------

# You can change these variables here, but if you do then
# it might be more difficult to upgrade to newer versions
# of this script. However, any file called ohi_config.py
# (in the current directory or 1 level up) will be read.
|
html_filename = "input.html" # set this to whatever
# - and when that file changes, this script will update
# files with that plus .index, .header and .footer
# (it might be a good idea to do a separate run of this
# script, from the command line, to perform that update,
# especially if you're on a slow machine and the webserver
# has a short timeout; note that the update does NOT have
# to be done on the same machine, as long as the resulting
# files can be copied across)
# (another speedup is to get a small wrapper script to
# import ohi_online; the compiled version can then be used
# after the first time)

alphabet = "abcdefghijklmnopqrstuvwxyz" # set to None for all characters and case-sensitive; any headings not containing ANY of these characters will be put in as-is anyway

# ignore_text_in_parentheses NOT available in the online version because it could make it impossible to fetch entries that differ from others only in parenthetical additions (unless you merge the entries, which might not be a good idea)

more_sensible_punctuation_sort_order = True

remove_utf8_diacritics = True # or False, for removing diacritics in index headings (not in main text);
# assumes UTF-8. (Letters with diacritics will be treated as though they did not have any.)

# Prompt text shown before the lookup box (front page and results pages):
frontpage_lookup_prompt = "Lookup: "
shorter_lookup_prompt = "Lookup: "

# Default number of index lines of context shown before/after the match:
lines_before = 5 ; lines_after = 10
# Cap on context lines, and the step added by each "more" link:
max_show_more = 50 ; increment = 10

between_before_and_after = "<br>"
# For more compactness, try this instead:
# between_before_and_after = " | "
# (depends on what sort of data you have though)

# You can override these functions if you want:
|
def preprocess_result(markup):
    """Hook for transforming entry markup before it is sent to the
    browser; the default implementation passes it through unchanged."""
    return markup
|
|
def links_to_related_services(query):
    """Hook returning extra HTML to show above the results for *query*,
    e.g. "Here | <a href...>Somewhere else</a>"; default is nothing."""
    return ""
|
|
|
|
code_to_run_when_DOM_changes = ""
# you can set this to any Javascript to run after our JS
# manages to change the DOM (on capable browsers), e.g. to
# fix some typography when browser support is detected

web_adjuster_extension_mode = False
# If set to True, this module's handle() will work - see
# Web Adjuster 'extensions' option for more details.
# If set to False, we just behave as a CGI script.

# URLs under which handle() recognises requests in extension mode:
web_adjuster_extension_url = "http://example.org/ohi.cgi"
web_adjuster_extension_url2 = "http://localhost/ohi.cgi"

cgi_name = "ohi.cgi" # for rewriting <a href="#..."> links
|
|
|
|
# Where to find history:
# on GitHub at https://github.com/ssb22/indexer
# and on GitLab at https://gitlab.com/ssb22/indexer
# and on BitBucket https://bitbucket.org/ssb22/indexer
# and at https://gitlab.developers.cam.ac.uk/ssb22/indexer
# and in China: https://gitee.com/ssb22/indexer

# ------------------------------------------
|
|
|
|
# allow overrides:
# look for ohi_config.py in the current directory or one level up
import sys ; sys.path = ['.','..'] + sys.path
try: import ohi_config
except ImportError: ohi_config = None

if not web_adjuster_extension_mode:
    import cgitb ; cgitb.enable() # remove this if you don't want tracebacks in the browser

import mmap, os, cgi, re
try: from urllib import quote # Python 2
except ImportError: from urllib.parse import quote # Python 3
if ohi_config:
    ohi_config.quote = quote # so functions there can use it
    # import AFTER the defaults above, so ohi_config can override any of them
    from ohi_config import *

try: xrange
except: xrange = range # Python 3
|
|
def B(s):
    """Coerce s to UTF-8 bytes: text strings are encoded, anything
    already in byte form is returned untouched."""
    return s.encode('utf-8') if type(s)==type(u"") else s
|
|
|
|
def create_linemap(fName):
    """Open fName read-only and return it wrapped in a LineMap mmap view."""
    fileobj = open(fName, "rb")
    view = LineMap(fileobj.fileno(), 0, access=mmap.ACCESS_READ)
    view.f = fileobj  # keep a reference so the file isn't closed by gc
    return view
|
|
class LineMap(mmap.mmap): # might fail in old Python versions where mmap isn't a class
    """Read-only mmap view of a file of sorted text lines, with
    line-oriented binary-search helpers (used for the index file)."""
    def linesAround(self,txt,linesBefore,linesAfter):
        "returns (before,line,after), up to numLines lines either side of the line appropriate for txt"
        self.seek(self.bisect(txt))
        # step back up to linesBefore lines, counting how many steps succeeded
        linesBefore = sum(self.back_line() for i in xrange(linesBefore))
        # read the before-lines forward again, then the target line, then the after-lines (dropping empty reads at EOF)
        return [self.readline() for i in xrange(linesBefore)],self.readline(),[x for x in [self.readline() for i in xrange(linesAfter)] if x]
    def bisect(self,txt,lo=0,hi=-1):
        "returns pos of start of appropriate line"
        txt = B(txt)
        if hi==-1: hi=len(self)
        elif hi <= lo:
            # return self.lineStart(hi)
            # amendment: if only the first few characters matched, it's possible that the PREVIOUS entry will match more characters (positioning is rarely helped by an inserted character, e.g. a pinyin shen/sheng confusion, and we probably want to draw more attention to the previous entries in this case, especially if the following entries are completely different e.g. 'shi'; TODO: could even do full 'first entry that matches as many characters as possible' logic)
            ret = self.lineStart(hi)
            if ret==0 or self[ret:ret+len(txt)]==txt: return ret # all characters match current line, or there are no previous lines
            txt2 = txt
            while len(txt2)>1 and not self[ret:ret+len(txt2)]==txt2: txt2 = txt2[:-1] # delete characters from the end until all that are left match current line
            ret2 = self.lineStart(ret-1)
            if self[ret2:ret2+len(txt2)+1]==txt[:len(txt2)+1]: return ret2 # return previous line if they match that as well
            else: return ret
        # recursive binary search over byte offsets, snapping the midpoint to a line start
        lWidth,uWidth = int((hi-lo)/2),int((hi-lo+1)/2)
        lMid = self.lineStart(lo+lWidth)
        lLine = self.lineAt(lMid)
        if lLine < txt: return self.bisect(txt,lMid+len(lLine),hi)
        else: return self.bisect(txt,lo,lMid)
    def lineStart(self,pos):
        # byte offset of the start of the line containing pos
        return self.rfind(B("\n"),0,pos)+1 # (for start of file, rfind will return -1 so this+1 is still what we want)
    def lineAt(self,pos):
        # read and return the line starting at pos (moves the file pointer)
        self.seek(pos) ; return self.readline()
    def back_line(self):
        # move the file pointer to the start of the previous line;
        # returns 1 if it moved, 0 if already at the start of the file
        p = self.tell()
        if not p: return 0
        elif self[p-1:p]==B('\n'):
            # currently at a line start: go to the start of the line before it
            self.seek(self.lineStart(p-1))
        else: self.seek(self.lineStart(p))
        return 1
|
|
|
|
# alphaOnly(x): normalise a heading or query into its index sort key,
# depending on the alphabet / punctuation-order configuration above.
if alphabet and more_sensible_punctuation_sort_order: alphaOnly = lambda x: re.sub('([;,]);+',r'\1',''.join(c for c in x.lower().replace('-',' ').replace(',','~COM~').replace(';',',').replace('~COM~',';').replace(' ',';') if c in alphabet+',;')) # gives ; < , == space (useful if ; is used to separate definitions and , is used before extra words to be added at the start; better set space EQUAL to comma, not higher, or will end up in wrong place if user inputs something forgetting the comma)
elif alphabet: alphaOnly = lambda x: ''.join(c for c in x.lower() if c in alphabet)
elif more_sensible_punctuation_sort_order: alphaOnly = lambda x: re.sub('([;,]);+',r'\1',x.replace('-',' ').replace(',','~COM~').replace(';',',').replace('~COM~',';').replace(' ',';'))
else: alphaOnly = lambda x:x
|
|
def ST(x):
    """Return x as the platform's native str type: a value that is
    already str passes through; bytes are UTF-8 decoded (Python 3)."""
    return x if type(x)==type("") else x.decode('utf-8')
|
|
# Invert alphaOnly's punctuation swap for display/URL purposes:
# ';' back to space and ',' back to ';' (alphaOnly swapped them above).
if more_sensible_punctuation_sort_order: undo_alphaOnly_swap = lambda x:ST(x).replace(';',' ').replace(',',';')
else: undo_alphaOnly_swap = lambda x:x
|
|
def U(s):
    """Return s as a unicode string, UTF-8 decoding byte strings."""
    return s if type(s)==type(u"") else s.decode('utf-8')
|
|
def S(s):
    """Return s in the native str type: a no-op on Python 3 (str is
    already unicode), UTF-8 encode on Python 2 (native str is bytes)."""
    if type(u"")==type(""):  # Python 3
        return s
    return s.encode('utf-8')  # Python 2
|
|
if remove_utf8_diacritics:
    # Wrap alphaOnly so index keys are diacritic-free: NFD-decompose and
    # drop all combining-mark characters before the normal filtering.
    _ao = alphaOnly ; import unicodedata
    alphaOnly = lambda x: _ao(S(u''.join((c for c in unicodedata.normalize('NFD',U(x)) if not unicodedata.category(c).startswith('M')))))
|
|
|
|
def load(fName):
    """Load the HTML data file and its index, (re)building the
    fName+.index/.header/.footer files when the data file is newer.
    Returns (text_linemap, index_linemap, header, footer)."""
    txt = create_linemap(fName)
    try:
        # reuse the existing index if it's at least as new as the data file
        if os.stat(fName).st_mtime <= os.stat(fName+".index").st_mtime:
            return txt,create_linemap(fName+".index"),open(fName+".header").read(),open(fName+".footer").read()
    except OSError: pass
    # rebuild: map each normalised heading to (display heading, byte ranges)
    ret = {}
    contentStart = 0 ; header="" ; tag = ""
    altTags = []
    for m in re.finditer(B(r'<a name="([^"]*)"></a>'),txt):
        # First, output the content from the PREVIOUS tag:
        if contentStart and contentStart==m.start():
            # oops, previous tag has NO content, so treat it as an 'alternate heading' to the tag we're about to have:
            altTags.append(tag)
        else:
            for ttag in [tag]+altTags:
                tag2 = alphaOnly(ttag)
                if not tag2: tag2 = ttag # heading had no alphabet characters: index it as-is
                if contentStart:
                    if not tag2 in ret: ret[tag2] = (ttag,[])
                    # record the byte range of the previous entry's content
                    ret[tag2][1].append("\t"+str(contentStart)+"\t"+str(m.start()))
                else: # we're on the first tag
                    assert not altTags
                    # everything before the first anchor is the page header
                    header=txt[:m.start()]
                    if type(u"")==type(""): header=header.decode('utf-8') # Python 3
            altTags = []
        # Now look at the new tag:
        tag = m.group(1) ; contentStart = m.end()
        if type(u"")==type(""): tag=tag.decode('utf-8') # Python 3
    # NOTE(review): everything after the LAST anchor becomes the footer, so
    # the last anchor is assumed to mark the start of the page footer rather
    # than a real entry - TODO confirm against ohi.py's output format
    footer = txt[contentStart:]
    if type(u"")==type(""): footer=footer.decode('utf-8') # Python 3
    if not header.strip(): header='<html><head><meta name="mobileoptimized" content="0"><meta name="viewport" content="width=device-width"><script>if(window.matchMedia && window.matchMedia("(prefers-color-scheme: dark)").matches)document.write("<style>body { background-color: black; color: #c0c000; } a { color: #00b000; }</style>");</script></head><body>'
    if not footer.strip(): footer = '</body></html>'
    try: ret = ret.iteritems() # Python 2
    except: ret = ret.items() # Python 3
    # one sorted index line per heading: sortkey \t display \t start \t end [...]
    ret = [tag2+"\t"+ttag+"".join(rest)+"\n" for tag2,(ttag,rest) in ret] ; ret.sort()
    open(fName+".index","w").write("".join(ret))
    open(fName+".header","w").write(header)
    open(fName+".footer","w").write(footer)
    return txt,create_linemap(fName+".index"),header,footer
|
|
|
|
# cginame: the script name used when generating links and form actions.
if web_adjuster_extension_mode: cginame = web_adjuster_extension_url[web_adjuster_extension_url.rindex('/')+1:]
else:
    # basename of argv[0] (prepend os.sep so rindex always finds a separator)
    cginame = os.sep+sys.argv[0] ; cginame=cginame[cginame.rindex(os.sep)+1:]
|
|
|
|
def queryForm(prompt):
    """Build the HTML lookup form (field 'q') posting back to this CGI."""
    return ('<form action="' + cginame + '">' + prompt
            + '<input type="text" name="q"><input type="Submit" value="OK"></form>')
|
|
def out(html="",req=None):
    """Emit a complete HTML page: header + lookup form + html + footer.
    Writes to the Web Adjuster request object if given, else prints a
    CGI response to stdout."""
    if not html:
        # empty page: focus the lookup box for the user
        html = '<script><!--\ndocument.forms[0].q.focus();\n//--></script>' # TODO: else which browsers need <br> after the </form> in the line below?
    page = queryForm(shorter_lookup_prompt) + html
    if req:
        req.set_header('Content-type','text/html; charset=utf-8')
        req.write(B(header + page + footer))
    else:
        print ("Content-type: text/html; charset=utf-8\n\n" + header + page + footer)
|
|
def link(l,highl=""):
    """Render one index line l ("sortkey\\tdisplay\\tstart\\tend...") as an
    expandable <a> link back to this CGI, bolding the prefix of the
    display text that matches the user's query highl."""
    l,linkText,rest = U(l).split('\t',2) ; highl = U(highl)
    mismatch = u""
    # trim highl until it is a prefix of the sort key, remembering what was trimmed
    while highl and not l.startswith(highl): highl,mismatch=highl[:-1],highl[-1]+mismatch
    i = j = 0
    # walk the display text against the trimmed query; characters outside
    # the configured alphabet are skipped without counting as matches
    for c in highl:
        matched = (linkText[i]==c)
        if matched or (alphabet and not linkText[i] in alphabet and not linkText[i] in l):
            i += 1
            if matched: j = i # j = end of last truly-matched character
        else: break
    if j:
        matchedPart,nextPart = linkText[:j],linkText[j:]
        if nextPart and not nextPart.startswith(" ") and mismatch and not mismatch.startswith(" "): # show a red border around the mismatched letter to reinforce what happened (but ensure it's a border, not font colour, because we don't know what the user's background colour is)
            nextPart="<span style=\"border: thin red solid\">"+nextPart[0]+"</span>"+nextPart[1:]
        linkText = '<b>'+matchedPart+'</b>'+nextPart
    l,linkText=S(l),S(linkText)
    return '<a href="'+cginame+'?q='+quote(undo_alphaOnly_swap(l))+'&e=1" onclick="return tryInline(this)">'+linkText+'</a>' # (this gives a 'click to expand/collapse' option on browsers that support it, TODO: configurable? option to have onMouseOver previews somewhere?? careful as could run into trouble with user CSS files)
    # (Could shorten l to the shortest unique part of the word, but that might not be a good idea if the data can change while users are online)
|
|
|
|
def redir(base,rest,req=None):
    """Issue an HTTP 302 redirect to base+rest, via the Web Adjuster
    request object if given, else as CGI headers on stdout. An empty
    base defaults to this script's own URL."""
    if not base:
        if web_adjuster_extension_mode:
            base = web_adjuster_extension_url
        else:
            base = os.environ.get("SCRIPT_URI",cginame) # cginame would make it a relative redirect, which might or might not work with the browser/server
    if req:
        req.set_status(302)
        req.set_header("Location",base+rest)
        return
    print ("Status: 302") # TODO: check this works on all servers
    print ("Location: "+base+rest)
    print ("")
|
|
|
|
def linkSub(txt):
    """Rewrite in-page fragment links (<a href="#x">) so they point back
    at this CGI as expand requests instead."""
    replacement = r'<a href=\1' + cgi_name + '?e=1&q='
    return re.sub(r'(?i)<a href=("?)#', replacement, ST(txt))
|
|
|
|
def main(req=None):
    """Handle one lookup request (CGI, or Web Adjuster when req is given).
    With no req in extension mode, just rebuilds the index and exits."""
    if req: query = req.request.arguments
    elif web_adjuster_extension_mode:
        # command-line run while in extension mode: refresh the index files only
        load(html_filename)
        sys.stderr.write("Index is now up-to-date\n")
        return
    else: query = cgi.parse()
    def qGet(k,default=""):
        # fetch one query parameter (cgi.parse may give a list)
        v = query.get(k,default)
        if type(v)==list: v=v[0]
        if type(v)==str: v=v.strip() # TODO: or just .lstrip() ? (accidental spaces entered on mobile devices)
        return v
    q = qGet("q")       # the lookup text
    a = int(qGet("a",lines_after))   # lines of context after
    b = int(qGet("b",lines_before))  # lines of context before
    e = qGet("e")       # 'expand entry' mode flag
    # redirect plain form submissions so the URL carries t=1 and jumps to the #e anchor
    if q and not e and a==lines_after and b==lines_before and not query.get("t",""): return redir("","?q="+quote(undo_alphaOnly_swap(q))+"&t=1#e",req=req)
    global header,footer
    txt,index,header,footer = load(html_filename)
    if not q: return out(req=req) # no query: just show the lookup form
    q,q0 = alphaOnly(q),q
    if not q: q = q0 # query had no alphabet characters: look it up as-is
    if e:
        # expand mode: fetch the byte ranges for this entry from its index line
        ranges = ST(index.linesAround(q,0,0)[1]).split("\t")[2:]
        toOut = preprocess_result("<hr>".join(linkSub(txt[int(a):int(b)]) for a,b in zip(ranges[::2], ranges[1::2])))
        if e=="2":
            # e=2: bare fragment for the inline XMLHttpRequest expansion
            if req:
                req.set_header('Content-type','text/plain; charset=utf-8')
                req.write(toOut)
            else: print ("Content-type: text/plain; charset=utf-8\n\n"+toOut) # for the XMLHttpRequest
            return
        else: return out(toOut,req=req)
    # normal mode: show the index line for q with context either side
    b4,line,aftr = index.linesAround(q,b,a)
    lnks = links_to_related_services(q0)
    if lnks: lnks += '<hr>'
    def more(a,b,tag,label): return ('<a name="%s" href="%s?q=%s&a=%d&b=%d#%s">%s</a>' % (tag,cginame,quote(undo_alphaOnly_swap(q)),a,b,tag,label)) # 'after' version of this works only if it's at the very bottom of the page, so the words above it are still on-screen when jumping to its hash
    if b < max_show_more and len(b4)==b: moreBefore = more(a,min(b+increment,max_show_more),"b","<< more")+between_before_and_after
    else: moreBefore = '<a name="b"></a>'
    if a < max_show_more and len(aftr)==a: moreAfter = between_before_and_after+more(min(a+increment,max_show_more),b,"a","more >>")
    else: moreAfter = '<a name="a"></a>'
    # if the separator is plain text (no markup), it can also surround the highlight table
    if not '<' in between_before_and_after: tableStyle,tableAround = ' style="display:inline-table"',between_before_and_after
    else: tableStyle,tableAround = "",""
    out(lnks+moreBefore+"""<script><!--
function tryInline(l) { l.onclick=function(){return false}; if(!(XMLHttpRequest&&l.innerHTML)) return true; var n=document.createElement("div"); l.parentNode.insertBefore(n,l.nextSibling); n.innerHTML="Loading"; if(n.innerHTML!="Loading") return true; n.setAttribute("style","border:thin blue solid"); function g(h){l.myStuff=h;n.innerHTML=h;if(l.parentNode.nodeName=='TD') l.parentNode.parentNode.parentNode.parentNode.style.display='block';l.onclick=function(){l.parentNode.removeChild(n);if(l.parentNode.nodeName=='TD') l.parentNode.parentNode.parentNode.parentNode.style.display='inline-table';l.onclick=function(){return tryInline(l)};return false};"""+code_to_run_when_DOM_changes+"""}; if(l.myStuff) g(l.myStuff);else{var req=new XMLHttpRequest();req.open("GET",l.href.replace("&e=1","&e=2"),true);req.onreadystatechange=function(){if(req.readyState==4)g(req.responseText)};req.send()}return false }
//--></script>"""+between_before_and_after.join(link(l) for l in b4)+tableAround+'<table border'+tableStyle+'><tbody><tr><td><a id="e" name="e"></a>'+link(line,q)+'</td></tr></tbody></table>'+tableAround+between_before_and_after.join(link(l) for l in aftr)+moreAfter,req=req)
|
|
|
|
def handle(url,req):
    """Web Adjuster extension entry point: serve the request if url is
    under either configured extension URL, returning True when handled
    (None otherwise, so Web Adjuster passes the request through)."""
    global web_adjuster_extension_url,web_adjuster_extension_url2
    if url.startswith(web_adjuster_extension_url):
        main(req)
        return True
    if url.startswith(web_adjuster_extension_url2):
        # serve under the secondary URL by temporarily making it primary,
        # restoring the originals even if main() raises
        web_adjuster_extension_url,web_adjuster_extension_url2 = web_adjuster_extension_url2,web_adjuster_extension_url
        try:
            main(req)
        finally:
            web_adjuster_extension_url,web_adjuster_extension_url2 = web_adjuster_extension_url2,web_adjuster_extension_url
        return True
|
|
|
|
if __name__=="__main__": main() # CGI / command-line entry point
|