1
0
mirror of https://github.com/ssb22/bits-and-bobs.git synced 2023-06-08 10:22:44 +00:00
bits-and-bobs/xml2dx.py

96 lines
4.3 KiB
Python

#!/usr/bin/env python2
# Simple script that takes XML (or non-XML) files,
# one record per line, and generates candidate entries
# for a word index into that data. Some of the entries
# will be of low quality and should be deleted.
# Silas S. Brown 2018 - public domain - no warranty
# Where to find history:
# on GitHub at https://github.com/ssb22/bits-and-bobs
# and on GitLab at https://gitlab.com/ssb22/bits-and-bobs
# and on BitBucket https://bitbucket.org/ssb22/bits-and-bobs
# and at https://gitlab.developers.cam.ac.uk/ssb22/bits-and-bobs
# and in China: https://gitee.com/ssb22/bits-and-bobs
# ---- Settings ----

# Regular expression selecting the part(s) of each record
# that should be indexed (or "" for all)
index_only_inside = r"<html>.*?</html>"

# If true, XML-style tags are removed from the text before
# it is split into words
mark_down_xml = True

# If true, each output entry refers to its record by line
# number instead of repeating the record's text
lineNos_only = True

# num words on each side
wordContext = 3

# Longest phrase (counted in words) to consider as an entry
max_phraseLen = 3

# multi-word phrase must match at least this number of
# different records to be indexed
min_records_multiPhrase = 2

# A phrase is not indexed if its first or last word is one
# of these.  All entries must be lower case (checked below).
stop_words = set([
    "a", "all", "are", "an", "and", "as", "at",
    "be", "but", "by",
    "can",
    "for",
    "have",
    "if", "in",
    "of", "on",
    "that", "the", "this", "to", "too",
    "some",
    "usually",
    "very",
    "with",
])  # etc
assert all(word == word.lower() for word in stop_words)
import sys, re
def candidate_phrases(words):
    """Yield (start, phraseLen) for each phrase in words worth indexing.

    words is one record's list of words as produced by split()
    (so every entry is a non-empty string).  Phrases of 1 up to
    max_phraseLen consecutive words are considered; start is the
    index of the phrase's first word.
    """
    for phraseLen in range(1, max_phraseLen + 1):
        # The "+ 1" makes the last valid start position inclusive, so a
        # phrase that ends on the record's final word is also considered
        # (without it the last word of every record was never indexed).
        for start in range(len(words) - phraseLen + 1):
            w = words[start:start + phraseLen]
            # don't cut across starting quotes, commas, etc
            # (but hyphens in middle OK)
            if phraseLen > 1 and not all(
                    x[0].isalpha() and x[-1].isalpha() for x in w):
                continue
            # every word must have at least one alphabetical char
            # for the phrase to make sense
            if not all(any(x.isalpha() for x in ww) for ww in w):
                continue
            # a phrase must not begin or end with a stop word
            if any(keywordify(ww) in stop_words for ww in (w[0], w[-1])):
                continue
            if any("://" in ww for ww in w):
                continue  # URLs
            yield (start, phraseLen)
def capsInitial(w):
    """Return w with its first alphabetic character upper-cased.

    Characters before the first letter (open-quote etc) are kept
    as they are.  If w has no alphabetic character at all, it is
    returned unchanged.
    """
    for i, ch in enumerate(w):
        if ch.isalpha():
            return w[:i] + ch.upper() + w[i + 1:]
    return w
def keywordify(w):
    """Return the lower-cased index keyword for the word w.

    Everything before the first alphabetic character and after the
    last one is dropped (characters in between, such as hyphens,
    are kept).  If w contains no alphabetic character, the whole
    of w is returned lower-cased.
    """
    letter_positions = [i for i, ch in enumerate(w) if ch.isalpha()]
    if not letter_positions:
        return w.lower()
    start = letter_positions[0]
    end = letter_positions[-1] + 1
    return w[start:end].lower()
def context(words, wordNo, phraseLen=1):
    """Return a one-line context snippet for a phrase within words.

    words is the record's list of words, wordNo the index of the
    phrase's first word and phraseLen its length in words.  The
    snippet is the phrase (each of its words given an initial
    capital via capsInitial) with up to wordContext lower-cased
    words on each side, joined by spaces.  "..." is added at either
    end where the record continues beyond the snippet.
    """
    phrase_end = wordNo + phraseLen
    a = max(0, wordNo - wordContext)
    b = min(len(words), phrase_end + wordContext)
    r = []
    if a:
        r.append("...")  # words were omitted before the snippet
    r.extend(w.lower() for w in words[a:wordNo])
    # TODO: syntax-highlight in some way? (if suitable output format)
    r.extend(capsInitial(w) for w in words[wordNo:phrase_end])
    r.extend(w.lower() for w in words[phrase_end:b])
    if b < len(words):
        r.append("...")  # words were omitted after the snippet
    return " ".join(r)
# ---- Main script: read records, collect candidates, prune, print ----

# Maps (phrase length in words, lower-cased phrase) to a set of
# (context snippet, record reference) pairs
mDict = {}

# One record per line of standard input.  (Python 2 style I/O:
# the text read is decoded from UTF-8 here and the output is
# encoded again when printed.)
lines = sys.stdin.read().decode('utf-8').split('\n')
lines2 = lines  # the text to take words from, parallel to lines
if index_only_inside:
    # keep only the parts of each record that match the pattern
    lines2 = [" ".join(re.findall(index_only_inside, l)) for l in lines]
if mark_down_xml:
    # remove tags
    lines2 = [re.sub("<[A-Za-z/][^>]*>", "", l) for l in lines2]
if lineNos_only:
    # refer to each record by its line number (counted from 0)
    # rather than by its text
    lines = [str(x) for x in range(len(lines))]

for kwds, orig in zip(lines2, lines):
    kwds = kwds.split()
    for start, phraseLen in candidate_phrases(kwds):
        c1 = " ".join(keywordify(k) for k in kwds[start:start + phraseLen])
        c2 = context(kwds, start, phraseLen)
        mDict.setdefault((phraseLen, c1), set()).add((c2, orig))

# Prune.  Entries are deleted from mDict inside this loop, so it
# iterates over a snapshot of the items taken beforehand.
for (phraseLen, c1), cList in list(mDict.items()):
    if phraseLen == 1:
        continue
    if (phraseLen, c1) not in mDict:
        continue  # already deleted on a previous iteration of this loop
    origLineSet = set(y for x, y in cList)
    if len(origLineSet) < min_records_multiPhrase:
        del mDict[(phraseLen, c1)]
        continue
    phraseWords = c1.split()
    for s in range(phraseLen):
        for e in range(s + 1, phraseLen + 1):
            if (s, e) == (0, phraseLen):
                continue  # that is the whole phrase itself
            k = (e - s, " ".join(phraseWords[s:e]))
            # no point listing the shorter phrase if all its entries
            # are duplicated by the longer one, and we can have
            # 'B: see A B' entries in the final UI (TODO: do this even
            # if MOST of its entries are duplicated? but we may or may
            # not have chosen sensible continuation words etc)
            if k in mDict and set(y for x, y in mDict[k]) == origLineSet:
                del mDict[k]

# Output: one sorted, tab-separated line per remaining entry:
# phrase, context snippet, record reference
out = []
for (phraseLen, c1), cList in mDict.items():
    for c2, c3 in cList:
        out.append((c1, c2, c3))
for c1, c2, c3 in sorted(out):
    print (c1+"\t"+c2+"\t"+c3).encode('utf-8')