Scripts/writing-britamerican.py

#!/usr/bin/env python3
# encoding: utf-8

"""
File: britamerican.py
Author: Paul Lajoie-Mazenc
Description: Checks for british and american spellings in a file. This is just
a basic thing, it may have lots of false positives/negatives.

Inspired from Nicholas J. Higham's “Handbook of Writing for the Mathematical
Sciences”
"""

import re
import argparse

# Words that have a different spelling
# (british, american) spelling
WORDS = [('behaviour', 'behavior'), ('colour', 'color'),
         ('catalogue', 'catalog'), ('centre', 'center'), ('defence', 'defense'),
         ('grey', 'gray'), ('manoeuvre', 'maneuver'),
         ('marvellous', 'marvelous'), ('modelled', 'modeled'),
         ('modelling', 'modeling'), ('skilful', 'skillful'),
         ('speciality', 'specialty'), ('acknowledgement', 'acknowledgment'),
         ('benefited', 'benefitted'), ('encyclopaedia', 'encyclopedia'),
         ('focused', 'focussed'), ('judgement', 'judgment'),
         ('appendices', 'appendixes'), ('formulae', 'formulas'),
         ('indices', 'indexes'), ('lemmata', 'lemmas'),
         ('vertices', 'vertexes'), ('optimisation', 'optimization')]
BRITISH = [word[0] for word in WORDS]
AMERICAN = [word[1] for word in WORDS]

# Exceptions for the *ise words, mostly verbs
# All the other *ise verbs should be *ise in british and *ize in american
EXCEPTIONS = ['advise', 'arise', 'circumcise', 'comprise', 'compromise',
    'concise', 'demise', 'despise', 'devise', 'disguise', 'excise', 'exercise',
    'expertise', 'franchise', 'guise', 'improvise', 'incise', 'likewise',
    'otherwise', 'precise', 'premise', 'promise', 'reprise', 'revise', 'rise',
    'size', 'scriptsize', 'footnotesize', 'supervise', 'surmise', 'surprise',
    'televise', 'treatise', 'wise']

# Detects words
re_words = re.compile('\\w+')
# Gets the *ise[ds] and *ize[ds]
re_ise = re.compile('\\b\\w+ise[ds]?\\b')
re_ize = re.compile('\\b\\w+ize[ds]?\\b')
# Gets the *yse[ds] and *yze[ds]
re_yse = re.compile('\\b\\w+yse[ds]?\\b')
re_yze = re.compile('\\b\\w+yze[ds]?\\b')
# The word ends with a d or an s
re_suffix = re.compile('^\\w+[ds]$')


def parse_args():
    """ Parses the arguments of the command line """
    parser = argparse.ArgumentParser(
            description="Checks a file for british and american spellings")

    parser.add_argument('files', metavar="files", type=str, nargs='+',
            help='file where to check the spellings')

    return parser.parse_args()

def check_british(text):
    """ Checks text for british words """
    return [word for word in text if word in BRITISH]

def check_american(text):
    """ Checks text for american words """
    return [word for word in text if word in AMERICAN]

def check_ise(text):
    """ Checks for words ending in ise[ds]? """
    return re_ise.findall(text)

def check_ize(text):
    """ Checks for words ending in ize[ds]? """
    return re_ize.findall(text)

def check_yse(text):
    """ Checks for words ending in yse[ds]? """
    return re_yse.findall(text)

def check_yze(text):
    """ Checks for words ending in yze[ds]? """
    return re_yze.findall(text)

def root(word):
    """ Gets the root of a word (ie removes the 'd' or 's' of past participle or plurals/conjugation """
    if re_suffix.match(word):
        return word[:-1]
    return word

def remove_exceptions(words):
    """ Removes exceptions from the resulting words """
    return [word for word in words if root(word) not in EXCEPTIONS]

def get_words(line):
    """ Gets the american and british spellings in text """
    british = []
    american = []

    line = line.lower()

    # British/American words
    words = re_words.findall(line)
    british.extend(check_british(words))
    american.extend(check_american(words))

    # -ise/-ize verbs
    british.extend(check_ise(line))
    american.extend(check_ize(line))

    # -yse/-yze verbs
    british.extend(check_yse(line))
    american.extend(check_yze(line))

    british = remove_exceptions(british)
    american = remove_exceptions(american)

    return british, american

def check_line(line, index):
    """ Checks the text for american and british spellings

    The formatting is correctly aligned for < 10,000 lines"""
    british, american = get_words(line)
    british_prefix = '\033[91m' + "UK" + '\033[0m'
    american_prefix = '\033[92m' + "US" + '\033[0m'
    if len(british) > 0 or len(american) > 0:
        pad = ''
        print("{:<4d}: ".format(index + 1), end='')
        if len(british) > 0:
            print("{}: {}".format(british_prefix, british))
            pad = ' '*6
        if len(american) > 0:
            print("{}{}: {}".format(pad, american_prefix, american))

def main():
    """ Main function """
    files = parse_args().files

    for file_ in files:
        try:
            fd = open(file_)
            lines = fd.readlines()
            fd.close()
        except IOError:
            print("Couldn't read file {}, skipping it".format(file_))
            break

        print(file_)
        for index, line in enumerate(lines):
            check_line(line, index)

if __name__ == '__main__':
    main()