#!/usr/bin/env python3

import slob
import sys
from collections import Counter

def slob_headwords(filename):
    """Count how often each key occurs in the slob file *filename*.

    Returns a Counter mapping every item key to its number of occurrences.
    """
    counts = Counter()
    with slob.open(filename) as dictionary:
        # Every item is counted regardless of its content type: restricting
        # the count to items whose content_type starts with 'text/html'
        # slows things down A LOT.
        counts.update(entry.key for entry in dictionary)
    return counts

def print_headwords(filename):
    """Print each distinct headword of *filename* on stdout, one per line.

    A headword that occurs more than once is also reported on stderr; the
    number of occurrences is appended only when it is greater than two.
    """
    for headword, occurrences in slob_headwords(filename).items():
        print(headword)
        if occurrences <= 1:
            continue
        suffix = '' if occurrences <= 2 else f' ({occurrences})'
        print(f'duplicate headword: {headword}{suffix}', file=sys.stderr)

def diff_headwords(filename1, filename2):
    """Print a headword summary of two slob files and how they differ.

    For each file, one line gives the number of unique headwords and, if
    there are any, the number of duplicate entries.  A final line gives how
    many headwords exist only in *filename2* (additions) and how many exist
    only in *filename1* (deletions).
    """
    def summarize(path, counts):
        # Duplicates are all occurrences beyond the first of each headword.
        unique = len(counts)
        extra = sum(counts.values()) - unique
        line = f'{path}: {unique:,} unique headwords'
        if extra:
            line += f' + {extra:,} duplicates'
        print(line)

    old = slob_headwords(filename1)
    new = slob_headwords(filename2)
    summarize(filename1, old)
    summarize(filename2, new)

    added = new.keys() - old.keys()
    removed = old.keys() - new.keys()
    print(f'{len(added):,} additions / {len(removed):,} deletions')


def main(argv=None):
    """Run the command-line interface.

    argv is the full argument vector including the program name; it
    defaults to sys.argv.  With one slob path the file's headwords are
    printed, with two paths the headwords of both files are compared.  Any
    other number of arguments exits with a usage message.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) == 2:
        print_headwords(argv[1])
    elif len(argv) == 3:
        diff_headwords(argv[1], argv[2])
    else:
        sys.exit(f'usage: {argv[0]} SLOB [SLOB2]')


# Only dispatch when executed as a script, so that importing this module
# (e.g. to reuse slob_headwords) neither parses the importer's arguments
# nor exits the interpreter.
if __name__ == '__main__':
    main()
