158 lines
5 KiB
Python
158 lines
5 KiB
Python
|
#!/usr/bin/env python3
|
||
|
# encoding: utf-8
|
||
|
|
||
|
"""
|
||
|
File: britamerican.py
|
||
|
Author: Paul Lajoie-Mazenc
|
||
|
Description: Checks for British and American spellings in a file. This is just
|
||
|
a basic thing, it may have lots of false positives/negatives.
|
||
|
|
||
|
Inspired by Nicholas J. Higham's “Handbook of Writing for the Mathematical
|
||
|
Sciences”
|
||
|
"""
|
||
|
|
||
|
import re
|
||
|
import argparse
|
||
|
|
||
|
# Spelling pairs that differ between the two variants.
# Each entry is (british spelling, american spelling).
WORDS = [
    ('behaviour', 'behavior'),
    ('colour', 'color'),
    ('catalogue', 'catalog'),
    ('centre', 'center'),
    ('defence', 'defense'),
    ('grey', 'gray'),
    ('manoeuvre', 'maneuver'),
    ('marvellous', 'marvelous'),
    ('modelled', 'modeled'),
    ('modelling', 'modeling'),
    ('skilful', 'skillful'),
    ('speciality', 'specialty'),
    ('acknowledgement', 'acknowledgment'),
    ('benefited', 'benefitted'),
    ('encyclopaedia', 'encyclopedia'),
    ('focused', 'focussed'),
    ('judgement', 'judgment'),
    ('appendices', 'appendixes'),
    ('formulae', 'formulas'),
    ('indices', 'indexes'),
    ('lemmata', 'lemmas'),
    ('vertices', 'vertexes'),
    ('optimisation', 'optimization'),
]
# Split the pairs into two parallel lists: first column, then second column.
BRITISH, AMERICAN = (list(column) for column in zip(*WORDS))
|
||
|
|
||
|
# Words ending in -ise (plus a few in -ize) that must never be reported,
# mostly verbs. Any other *ise word is treated as british and any other
# *ize word as american.
EXCEPTIONS = [
    'advise', 'arise', 'circumcise', 'comprise', 'compromise', 'concise',
    'demise', 'despise', 'devise', 'disguise', 'excise', 'exercise',
    'expertise', 'franchise', 'guise', 'improvise', 'incise', 'likewise',
    'otherwise', 'precise', 'premise', 'promise', 'reprise', 'revise',
    'rise', 'size', 'scriptsize', 'footnotesize', 'supervise', 'surmise',
    'surprise', 'televise', 'treatise', 'wise',
]
|
||
|
|
||
|
# Tokenizer: every run of word characters
re_words = re.compile(r'\w+')

# Whole words ending in -ise / -ize, optionally followed by 'd' or 's'
re_ise = re.compile(r'\b\w+ise[ds]?\b')
re_ize = re.compile(r'\b\w+ize[ds]?\b')

# Whole words ending in -yse / -yze, optionally followed by 'd' or 's'
re_yse = re.compile(r'\b\w+yse[ds]?\b')
re_yze = re.compile(r'\b\w+yze[ds]?\b')

# A word whose last character is 'd' or 's'
re_suffix = re.compile(r'^\w+[ds]$')
|
||
|
|
||
|
|
||
|
def parse_args():
    """Build the command-line parser and parse the program arguments.

    Returns the argparse namespace; its ``files`` attribute holds the
    one or more file names given on the command line.
    """
    summary = "Checks a file for british and american spellings"
    cli = argparse.ArgumentParser(description=summary)
    cli.add_argument(
        'files',
        metavar="files",
        type=str,
        nargs='+',
        help='file where to check the spellings',
    )
    namespace = cli.parse_args()
    return namespace
|
||
|
|
||
|
def check_british(text):
    """Return the items of `text` that appear in BRITISH, in input order.

    `text` is iterated item by item, so callers pass a sequence of words.
    """
    hits = [token for token in text if token in BRITISH]
    return hits
|
||
|
|
||
|
def check_american(text):
    """Return the items of `text` that appear in AMERICAN, in input order.

    `text` is iterated item by item, so callers pass a sequence of words.
    """
    hits = [token for token in text if token in AMERICAN]
    return hits
|
||
|
|
||
|
def check_ise(text):
    """Return every word of the string `text` that ends in ise, ised or ises."""
    matches = re_ise.findall(text)
    return matches
|
||
|
|
||
|
def check_ize(text):
    """Return every word of the string `text` that ends in ize, ized or izes."""
    matches = re_ize.findall(text)
    return matches
|
||
|
|
||
|
def check_yse(text):
    """Return every word of the string `text` that ends in yse, ysed or yses."""
    matches = re_yse.findall(text)
    return matches
|
||
|
|
||
|
def check_yze(text):
    """Return every word of the string `text` that ends in yze, yzed or yzes."""
    matches = re_yze.findall(text)
    return matches
|
||
|
|
||
|
def root(word):
    """Strip one trailing 'd' or 's' from `word`, if it has one.

    This undoes the past-participle / plural / conjugation ending so the
    result can be compared with base forms. Words that do not match
    `re_suffix` are returned unchanged.
    """
    return word[:-1] if re_suffix.match(word) else word
|
||
|
|
||
|
def remove_exceptions(words):
    """Return `words` without the entries whose root is listed in EXCEPTIONS.

    The order of the remaining words is preserved.
    """
    kept = []
    for candidate in words:
        if root(candidate) in EXCEPTIONS:
            continue
        kept.append(candidate)
    return kept
|
||
|
|
||
|
def get_words(line):
    """Collect the british and american spellings found in `line`.

    The line is lower-cased first. Each returned list holds, in order, the
    hits from the fixed word list, then the -ise/-ize matches, then the
    -yse/-yze matches, with the known exceptions filtered out.

    Returns a (british, american) tuple of lists.
    """
    lowered = line.lower()
    tokens = re_words.findall(lowered)

    # Fixed word list, then -ise/-ize verbs, then -yse/-yze verbs
    british = check_british(tokens) + check_ise(lowered) + check_yse(lowered)
    american = check_american(tokens) + check_ize(lowered) + check_yze(lowered)

    return remove_exceptions(british), remove_exceptions(american)
|
||
|
|
||
|
def check_line(line, index):
    """Print the british and american spellings found in one line.

    Nothing is printed when the line contains no flagged word. Otherwise
    the 1-based line number is printed, followed by a "UK" row and/or a
    "US" row listing the words found.

    line  -- text of the line to check
    index -- 0-based position of the line in its file
    """
    british, american = get_words(line)
    if not british and not american:
        return

    # ANSI colours: red for the UK label, green for the US label
    british_prefix = '\033[91m' + "UK" + '\033[0m'
    american_prefix = '\033[92m' + "US" + '\033[0m'

    number_column = "{:<4d}: ".format(index + 1)
    print(number_column, end='')

    pad = ''
    if british:
        print("{}: {}".format(british_prefix, british))
        # The US row starts on a new line: indent it by the real width of
        # the number column so it stays aligned for any line number.
        pad = ' ' * len(number_column)
    if american:
        print("{}{}: {}".format(pad, american_prefix, american))
|
||
|
|
||
|
def main():
    """Check every file named on the command line.

    For each readable file, its name is printed and then every line is
    passed to check_line. A file that cannot be opened, read or decoded is
    reported and skipped; the remaining files are still processed.
    """
    files = parse_args().files

    for file_ in files:
        try:
            # The context manager closes the file even if reading fails.
            with open(file_) as fd:
                lines = fd.readlines()
        except (IOError, UnicodeDecodeError):
            print("Couldn't read file {}, skipping it".format(file_))
            # Skip only this file and carry on with the next one.
            continue

        print(file_)
        for index, line in enumerate(lines):
            check_line(line, index)
|
||
|
|
||
|
# Run the checker only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
|