# Source code for ffp.vocab.subword

"""
Finalfusion Subword Vocabularies
"""

import collections
import struct
from abc import abstractmethod
from os import PathLike
from typing import List, Optional, Tuple, Any, Union, Dict, BinaryIO

from ffp.io import ChunkIdentifier, find_chunk, _write_binary, _read_binary, Chunk
from ffp.subwords import ExplicitIndexer, FastTextIndexer, FinalfusionHashIndexer, word_ngrams
from ffp.vocab.vocab import Vocab, _validate_words_and_create_index, _calculate_serialized_size, \
    _write_words_binary, _read_items
from ffp.vocab.cutoff import Cutoff, _count_words, _filter_and_sort


class SubwordVocab(Vocab):
    """
    Interface for vocabularies with subword lookups.
    """

    def idx(self, item: str, default=None) -> Optional[Union[List[int], int]]:
        """
        Look up the index of a known item or the subword indices of an unknown one.

        Parameters
        ----------
        item : str
            The query item.
        default : Any, optional
            Value returned if the item is unknown and has no subword indices.

        Returns
        -------
        idx : int, List[int], optional
            The word index if `item` is in the word index, otherwise the
            non-empty list of subword indices, otherwise `default`.
        """
        idx = self.word_index.get(item)
        if idx is not None:
            return idx
        subwords = self.subword_indices(item)
        if subwords:
            return subwords
        return default

    @property
    def idx_bound(self) -> int:
        """
        Get the exclusive upper bound of indices produced by this vocabulary.

        Returns
        -------
        idx_bound : int
            Number of items in the vocabulary plus the `idx_bound` of the
            subword indexer.
        """
        return len(self) + self.subword_indexer.idx_bound

    @property
    def min_n(self) -> int:
        """
        Get the lower bound of the range of extracted n-grams.

        Returns
        -------
        min_n : int
            lower bound of n-gram range.
        """
        return self.subword_indexer.min_n

    @property
    def max_n(self) -> int:
        """
        Get the upper bound of the range of extracted n-grams.

        Returns
        -------
        max_n : int
            upper bound of n-gram range.
        """
        return self.subword_indexer.max_n

    @property
    @abstractmethod
    def subword_indexer(
            self
    ) -> Union[ExplicitIndexer, FinalfusionHashIndexer, FastTextIndexer]:
        """
        Get this vocab's subword Indexer.

        The subword indexer produces indices for n-grams.

        In case of bucket vocabularies, this is a hash-based indexer
        (:class:`.FinalfusionHashIndexer`, :class:`.FastTextIndexer`). For explicit subword
        vocabularies, this is an :class:`.ExplicitIndexer`.

        Returns
        -------
        subword_indexer : ExplicitIndexer, FinalfusionHashIndexer, FastTextIndexer
            The subword indexer of the vocabulary.
        """

    def subwords(self, item: str, bracket: bool = True) -> List[str]:
        """
        Get the n-grams of the given item as a list.

        The n-gram range is determined by the `min_n` and `max_n` values.

        Parameters
        ----------
        item : str
            The query item to extract n-grams from.
        bracket : bool
            Toggles bracketing the item with '<' and '>' before extraction.

        Returns
        -------
        ngrams : List[str]
            List of n-grams.
        """
        return word_ngrams(item, self.min_n, self.max_n, bracket)

    def subword_indices(self, item: str, bracket: bool = True) -> List[int]:
        """
        Get the subword indices for the given item.

        This list does not contain the index for known items.

        Parameters
        ----------
        item : str
            The query item.
        bracket : bool
            Toggles bracketing the item with '<' and '>' before extraction.

        Returns
        -------
        indices : List[int]
            The list of subword indices.
        """
        # Subword indices are offset by the number of words, so they do not
        # collide with the word indices.
        return self.subword_indexer.subword_indices(item,
                                                    offset=len(self.words),
                                                    bracket=bracket)

    def __getitem__(self, item: str) -> Union[int, List[int]]:
        """
        Look up the index or subword indices of the given item.

        Behaves like :meth:`idx`, but raises instead of returning a default.

        Raises
        ------
        KeyError
            If the item is unknown and no subword indices could be found.
        """
        idx = self.word_index.get(item)
        if idx is not None:
            return idx
        subwords = self.subword_indices(item)
        if subwords:
            return subwords
        raise KeyError(f"No indices found for {item}")

    def __eq__(self, other: Any) -> bool:
        """
        Compare n-gram ranges, then defer to the parent class comparison.
        """
        if not isinstance(other, type(self)):
            return False
        if self.min_n != other.min_n:
            return False
        if self.max_n != other.max_n:
            return False
        return super().__eq__(other)


class FinalfusionBucketVocab(Chunk, SubwordVocab):
    """
    Finalfusion Bucket Vocabulary.
    """

    def __init__(self,
                 words: List[str],
                 indexer: Optional[FinalfusionHashIndexer] = None,
                 index: Optional[Dict[str, int]] = None):
        """
        Initialize a FinalfusionBucketVocab.

        Initializes the vocabulary with the given words and optional index and
        indexer.

        If no indexer is passed, a FinalfusionHashIndexer with bucket exponent
        21 is used.

        If no index is given, the nth word in the `words` list is assigned
        index `n`. The word list cannot contain duplicate entries and it needs
        to be of same length as the index.

        Parameters
        ----------
        words : List[str]
            List of unique words
        indexer : FinalfusionHashIndexer, optional
            Subword indexer to use for the vocabulary. Defaults to an indexer
            with 2^21 buckets with range 3-6.
        index : Dict[str, int], optional
            Dictionary providing an entry -> index mapping.

        Raises
        ------
        ValueError
            if the length of `index` and `words` doesn't match.
        AssertionError
            If the indexer is not a FinalfusionHashIndexer.
        """
        if indexer is None:
            indexer = FinalfusionHashIndexer(21)
        # Kept as an assert because AssertionError is the documented contract.
        assert isinstance(indexer, FinalfusionHashIndexer)
        super().__init__()
        self._index = _validate_words_and_create_index(words, index)
        self._words = words
        self._indexer = indexer

    @staticmethod
    def from_corpus(
            file: Union[str, bytes, int, PathLike],
            cutoff: Optional[Cutoff] = None,
            indexer: Optional[FinalfusionHashIndexer] = None,
    ) -> Tuple['FinalfusionBucketVocab', List[int]]:
        """
        Build a Finalfusion Bucket Vocabulary from a corpus.

        Parameters
        ----------
        file : str, bytes, int, PathLike
            File with white-space separated tokens.
        cutoff : Cutoff, optional
            Frequency cutoff or target size to restrict vocabulary size. Defaults to
            minimum frequency cutoff of 30.
        indexer : FinalfusionHashIndexer, optional
            Subword indexer to use for the vocabulary. Defaults to an indexer
            with 2^21 buckets with range 3-6.

        Returns
        -------
        (vocab, counts) : Tuple[FinalfusionBucketVocab, List[int]]
            Tuple containing the Vocabulary as first item and counts of in-vocabulary items
            as the second item.

        Raises
        ------
        AssertionError
            If the indexer is not a FinalfusionHashIndexer.
        """
        assert indexer is None or isinstance(indexer, FinalfusionHashIndexer)
        cnt = _count_words(file)
        if cutoff is None:
            cutoff = Cutoff(30, mode='min_freq')
        words, counts = _filter_and_sort(cnt, cutoff)
        return FinalfusionBucketVocab(words, indexer), counts

    def to_explicit(self) -> 'ExplicitVocab':
        """
        Returns a Vocabulary with explicit storage built from this vocab.

        Returns
        -------
        explicit_vocab : ExplicitVocab
            The converted vocabulary.
        """
        return _to_explicit(self)

    def write_chunk(self, file: BinaryIO):
        """
        Write this vocabulary as a bucket vocab chunk to the given binary file.
        """
        _write_bucket_vocab(self, file)

    @property
    def subword_indexer(self) -> FinalfusionHashIndexer:
        """
        Get the hash indexer of this vocabulary.
        """
        return self._indexer

    @property
    def words(self) -> list:
        """
        Get the list of in-vocabulary words.
        """
        return self._words

    @property
    def word_index(self) -> dict:
        """
        Get the word -> index mapping.
        """
        return self._index

    @staticmethod
    def read_chunk(file: BinaryIO) -> 'FinalfusionBucketVocab':
        """
        Read a FinalfusionBucketVocab from the file's current position.

        Expects the chunk body: number of words, `min_n`, `max_n` and the
        bucket value handed to :class:`.FinalfusionHashIndexer`, followed by
        the serialized words.
        """
        length, min_n, max_n, buckets = _read_binary(file, "<QIII")
        words, index = _read_items(file, length)
        indexer = FinalfusionHashIndexer(buckets, min_n, max_n)
        return FinalfusionBucketVocab(words, indexer, index)

    @staticmethod
    def chunk_identifier():
        """
        Get the chunk identifier of this vocabulary type.
        """
        return ChunkIdentifier.BucketSubwordVocab

    def __eq__(self, other):
        """
        Compare indexer type and index bound, then defer to SubwordVocab.
        """
        if not isinstance(other, type(self)):
            return False
        if not isinstance(other.subword_indexer, type(self.subword_indexer)):
            return False
        if self.subword_indexer.idx_bound != other.subword_indexer.idx_bound:
            return False
        return super().__eq__(other)


class FastTextVocab(Chunk, SubwordVocab):
    """
    FastText vocabulary
    """

    def __init__(self,
                 words: List[str],
                 indexer: Optional[FastTextIndexer] = None,
                 index: Optional[Dict[str, int]] = None):
        """
        Initialize a FastTextVocab.

        Initializes the vocabulary with the given words and optional index and
        indexer.

        If no indexer is passed, a FastTextIndexer with 2,000,000 buckets is
        used.

        If no index is given, the nth word in the `words` list is assigned
        index `n`. The word list cannot contain duplicate entries and it needs
        to be of same length as the index.

        Parameters
        ----------
        words : List[str]
            List of unique words
        indexer : FastTextIndexer, optional
            Subword indexer to use for the vocabulary. Defaults to an indexer
            with 2,000,000 buckets with range 3-6.
        index : Dict[str, int], optional
            Dictionary providing an entry -> index mapping.

        Raises
        ------
        ValueError
            if the length of `index` and `words` doesn't match.
        AssertionError
            If the indexer is not a FastTextIndexer.
        """
        if indexer is None:
            indexer = FastTextIndexer(2000000)
        # Kept as an assert because AssertionError is the documented contract.
        assert isinstance(indexer, FastTextIndexer)
        super().__init__()
        self._index = _validate_words_and_create_index(words, index)
        self._words = words
        self._indexer = indexer

    @staticmethod
    def from_corpus(
            file: Union[str, bytes, int, PathLike],
            cutoff: Optional[Cutoff] = None,
            indexer: Optional[FastTextIndexer] = None,
    ) -> Tuple['FastTextVocab', List[int]]:
        """
        Build a fastText vocabulary from a corpus.

        Parameters
        ----------
        file: str, bytes, int, PathLike
            File with white-space separated tokens.
        cutoff : Cutoff, optional
            Frequency cutoff or target size to restrict vocabulary size. Defaults to
            minimum frequency cutoff of 30.
        indexer : FastTextIndexer, optional
            Subword indexer to use for the vocabulary. Defaults to an indexer
            with 2,000,000 buckets with range 3-6.

        Returns
        -------
        (vocab, counts) : Tuple[FastTextVocab, List[int]]
            Tuple containing the Vocabulary as first item and counts of in-vocabulary items
            as the second item.

        Raises
        ------
        AssertionError
            If the indexer is not a FastTextIndexer.
        """
        assert indexer is None or isinstance(indexer, FastTextIndexer)
        cnt = _count_words(file)
        if cutoff is None:
            cutoff = Cutoff(30, mode='min_freq')
        words, counts = _filter_and_sort(cnt, cutoff)
        return FastTextVocab(words, indexer), counts

    def to_explicit(self) -> 'ExplicitVocab':
        """
        Returns a Vocabulary with explicit storage built from this vocab.

        Returns
        -------
        explicit_vocab : ExplicitVocab
            The converted vocabulary.
        """
        return _to_explicit(self)

    @property
    def subword_indexer(self) -> FastTextIndexer:
        """
        Get the fastText indexer of this vocabulary.
        """
        return self._indexer

    @property
    def words(self) -> list:
        """
        Get the list of in-vocabulary words.
        """
        return self._words

    @property
    def word_index(self) -> dict:
        """
        Get the word -> index mapping.
        """
        return self._index

    @staticmethod
    def read_chunk(file: BinaryIO) -> 'FastTextVocab':
        """
        Read a FastTextVocab from the file's current position.

        Expects the chunk body: number of words, `min_n`, `max_n` and the
        bucket value handed to :class:`.FastTextIndexer`, followed by the
        serialized words.
        """
        length, min_n, max_n, buckets = _read_binary(file, "<QIII")
        words, index = _read_items(file, length)
        indexer = FastTextIndexer(buckets, min_n, max_n)
        return FastTextVocab(words, indexer, index)

    def write_chunk(self, file: BinaryIO):
        """
        Write this vocabulary as a fastText vocab chunk to the given binary file.
        """
        _write_bucket_vocab(self, file)

    @staticmethod
    def chunk_identifier():
        """
        Get the chunk identifier of this vocabulary type.
        """
        return ChunkIdentifier.FastTextSubwordVocab

    def __eq__(self, other):
        """
        Compare indexer type and index bound, then defer to SubwordVocab.
        """
        if not isinstance(other, type(self)):
            return False
        if not isinstance(other.subword_indexer, type(self.subword_indexer)):
            return False
        if self.subword_indexer.idx_bound != other.subword_indexer.idx_bound:
            return False
        return super().__eq__(other)


class ExplicitVocab(Chunk, SubwordVocab):
    """
    A vocabulary with explicitly stored n-grams.
    """

    def __init__(self,
                 words: List[str],
                 indexer: ExplicitIndexer,
                 index: Optional[Dict[str, int]] = None):
        """
        Initialize an ExplicitVocab.

        Initializes the vocabulary with the given words, subword indexer and an
        optional word index.

        If no index is given, the nth word in the `words` list is assigned
        index `n`. The word list cannot contain duplicate entries and it needs
        to be of same length as the index.

        Parameters
        ----------
        words : List[str]
            List of unique words
        indexer : ExplicitIndexer
            Subword indexer to use for the vocabulary.
        index : Dict[str, int], optional
            Dictionary providing a word -> index mapping.

        Raises
        ------
        ValueError
            if the length of ``index`` and ``words`` doesn't match.
        AssertionError
            If the indexer is not an ExplicitIndexer.

        See Also
        --------
        :class:`.ExplicitIndexer`
        """
        # Kept as an assert because AssertionError is the documented contract.
        assert isinstance(indexer, ExplicitIndexer)
        super().__init__()
        self._index = _validate_words_and_create_index(words, index)
        self._words = words
        self._indexer = indexer

    @staticmethod
    def from_corpus(file: Union[str, bytes, int, PathLike],
                    ngram_range=(3, 6),
                    token_cutoff: Optional[Cutoff] = None,
                    ngram_cutoff: Optional[Cutoff] = None
                    ) -> Tuple['ExplicitVocab', List[int], List[int]]:
        """
        Build an ExplicitVocab from a corpus.

        Parameters
        ----------
        file: str, bytes, int, PathLike
            File with white-space separated tokens.
        ngram_range : Tuple[int, int]
            Specifies the n-gram range for the indexer.
        token_cutoff : Cutoff, optional
            Frequency cutoff or target size to restrict token vocabulary size. Defaults to
            minimum frequency cutoff of 30.
        ngram_cutoff : Cutoff, optional
            Frequency cutoff or target size to restrict ngram vocabulary size. Defaults to
            minimum frequency cutoff of 30.

        Returns
        -------
        (vocab, token_counts, ngram_counts) : Tuple[ExplicitVocab, List[int], List[int]]
            Tuple containing the Vocabulary as first item, counts of in-vocabulary tokens
            as the second item and in-vocabulary ngram counts as the last item.
        """
        # Apply the documented defaults; the other vocabularies' from_corpus
        # constructors do the same before calling _filter_and_sort.
        if token_cutoff is None:
            token_cutoff = Cutoff(30, mode='min_freq')
        if ngram_cutoff is None:
            ngram_cutoff = Cutoff(30, mode='min_freq')
        min_n, max_n = ngram_range
        cnt = _count_words(file)
        # Every n-gram inherits the corpus frequency of each word it occurs in.
        ngram_counter = collections.Counter()
        for word, count in cnt.items():
            for ngram in word_ngrams(word, min_n, max_n):
                ngram_counter[ngram] += count
        words, tok_cnt = _filter_and_sort(cnt, token_cutoff)
        ngrams, ngram_cnt = _filter_and_sort(ngram_counter, ngram_cutoff)
        indexer = ExplicitIndexer(ngrams, ngram_range=ngram_range)
        return ExplicitVocab(words, indexer), tok_cnt, ngram_cnt

    @property
    def words(self) -> list:
        """
        Get the list of in-vocabulary words.
        """
        return self._words

    @property
    def word_index(self) -> dict:
        """
        Get the word -> index mapping.
        """
        return self._index

    @property
    def subword_indexer(self) -> ExplicitIndexer:
        """
        Get the explicit n-gram indexer of this vocabulary.
        """
        return self._indexer

    @staticmethod
    def chunk_identifier():
        """
        Get the chunk identifier of this vocabulary type.
        """
        return ChunkIdentifier.ExplicitSubwordVocab

    @staticmethod
    def read_chunk(file: BinaryIO) -> 'ExplicitVocab':
        """
        Read an ExplicitVocab from the file's current position.

        Expects the chunk body: number of words, number of n-grams, `min_n`
        and `max_n`, followed by the serialized words and the serialized
        n-grams with their indices.
        """
        length, ngram_length, min_n, max_n = _read_binary(file, "<QQII")
        words, word_index = _read_items(file, length)
        ngrams, ngram_index = _read_items(file, ngram_length, indices=True)
        indexer = ExplicitIndexer(ngrams, (min_n, max_n), ngram_index)
        return ExplicitVocab(words, indexer, word_index)

    def write_chunk(self, file) -> None:
        """
        Write this vocabulary as an explicit vocab chunk to the given binary file.

        Writes the chunk header, the words and then each n-gram as
        (byte length, UTF-8 bytes, index).
        """
        # NOTE(review): each n-gram record written below also carries an
        # 8-byte index; confirm that _calculate_serialized_size accounts for
        # it, otherwise chunk_length is short by 8 * len(ngrams).
        chunk_length = _calculate_serialized_size(self.words)
        chunk_length += _calculate_serialized_size(self.subword_indexer.ngrams)
        min_n_max_n_size = struct.calcsize("<II")
        chunk_length += min_n_max_n_size
        chunk_header = (int(self.chunk_identifier()), chunk_length,
                        len(self.words), len(self.subword_indexer.ngrams),
                        self.min_n, self.max_n)
        _write_binary(file, "<IQQQII", *chunk_header)
        _write_words_binary((bytes(word, "utf-8") for word in self.words),
                            file)
        for ngram in self.subword_indexer.ngrams:
            b_ngram = ngram.encode("utf-8")
            _write_binary(file, "<I", len(b_ngram))
            file.write(b_ngram)
            _write_binary(file, "<Q", self.subword_indexer.ngram_index[ngram])

    def __eq__(self, other):
        """
        Compare the explicit indexers' contents, then defer to SubwordVocab.
        """
        if not isinstance(other, ExplicitVocab):
            return False
        if not isinstance(other.subword_indexer, ExplicitIndexer):
            return False
        if self.subword_indexer.idx_bound != other.subword_indexer.idx_bound:
            return False
        if self.subword_indexer.ngrams != other.subword_indexer.ngrams:
            return False
        if self.subword_indexer.ngram_index != other.subword_indexer.ngram_index:
            return False
        return super().__eq__(other)


def load_finalfusion_bucket_vocab(file: Union[str, bytes, int, PathLike]
                                  ) -> FinalfusionBucketVocab:
    """
    Load a FinalfusionBucketVocab from the given finalfusion file.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to file containing a FinalfusionBucketVocab chunk.

    Returns
    -------
    vocab : FinalfusionBucketVocab
        Returns the first FinalfusionBucketVocab in the file.

    Raises
    ------
    ValueError
        If the file does not contain a FinalfusionBucketVocab chunk.
    """
    with open(file, "rb") as inf:
        chunk = find_chunk(inf, [ChunkIdentifier.BucketSubwordVocab])
        if chunk is None:
            raise ValueError('File did not contain a FinalfusionBucketVocab')
        return FinalfusionBucketVocab.read_chunk(inf)


def load_fasttext_vocab(file: Union[str, bytes, int, PathLike]
                        ) -> FastTextVocab:
    """
    Load a FastTextVocab from the given finalfusion file.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to file containing a FastTextVocab chunk.

    Returns
    -------
    vocab : FastTextVocab
        Returns the first FastTextVocab in the file.

    Raises
    ------
    ValueError
        If the file does not contain a FastTextVocab chunk.
    """
    with open(file, "rb") as inf:
        chunk = find_chunk(inf, [ChunkIdentifier.FastTextSubwordVocab])
        if chunk is None:
            raise ValueError('File did not contain a FastTextVocab')
        return FastTextVocab.read_chunk(inf)


def load_explicit_vocab(file: Union[str, bytes, int, PathLike]
                        ) -> ExplicitVocab:
    """
    Load an ExplicitVocab from the given finalfusion file.

    Parameters
    ----------
    file : str, bytes, int, PathLike
        Path to file containing an ExplicitVocab chunk.

    Returns
    -------
    vocab : ExplicitVocab
        Returns the first ExplicitVocab in the file.

    Raises
    ------
    ValueError
        If the file does not contain an ExplicitVocab chunk.
    """
    with open(file, "rb") as inf:
        chunk = find_chunk(inf, [ChunkIdentifier.ExplicitSubwordVocab])
        if chunk is None:
            raise ValueError('File did not contain an ExplicitVocab')
        return ExplicitVocab.read_chunk(inf)


def _to_explicit(vocab: Union[FinalfusionBucketVocab, FastTextVocab]
                 ) -> 'ExplicitVocab':
    """
    Build an ExplicitVocab from a bucket vocabulary.

    Collects the n-grams of every in-vocabulary word in first-seen order.
    Indices returned by the vocab's subword indexer are renumbered
    consecutively in order of first appearance, so n-grams that the indexer
    maps to the same index also share their new index.

    Parameters
    ----------
    vocab : FinalfusionBucketVocab, FastTextVocab
        The bucket vocabulary to convert.

    Returns
    -------
    explicit_vocab : ExplicitVocab
        Vocabulary sharing the words and word index of `vocab`.
    """
    seen_ngrams = []
    explicit_index = {}
    # Maps an index of the original indexer to its new consecutive index.
    renumbered = {}
    for token in vocab.words:
        for ngram in vocab.subwords(token):
            if ngram in explicit_index:
                continue
            seen_ngrams.append(ngram)
            original_idx = vocab.subword_indexer(ngram)
            if original_idx not in renumbered:
                renumbered[original_idx] = len(renumbered)
            explicit_index[ngram] = renumbered[original_idx]
    ngram_range = (vocab.min_n, vocab.max_n)
    explicit_indexer = ExplicitIndexer(seen_ngrams, ngram_range, explicit_index)
    return ExplicitVocab(vocab.words, explicit_indexer, vocab.word_index)


def _write_bucket_vocab(vocab: Union[FinalfusionBucketVocab, FastTextVocab],
                        file: BinaryIO):
    """
    Serialize a bucket vocabulary as a chunk into the given binary file.

    Writes the chunk header (identifier, chunk length, number of words,
    `min_n`, `max_n`, bucket value) followed by the UTF-8 encoded words.

    Parameters
    ----------
    vocab : FinalfusionBucketVocab, FastTextVocab
        The vocabulary to write.
    file : BinaryIO
        Binary file to write to.
    """
    # Chunk length: serialized words, then min_n/max_n, then the bucket field.
    payload_size = (_calculate_serialized_size(vocab.words)
                    + struct.calcsize("<II")
                    + struct.calcsize("<I"))

    identifier = vocab.chunk_identifier()
    # fastText vocabs store the indexer's idx_bound, all others its bucket
    # exponent.
    is_fasttext = identifier == ChunkIdentifier.FastTextSubwordVocab
    bucket_field = (vocab.subword_indexer.idx_bound if is_fasttext
                    else vocab.subword_indexer.buckets_exp)

    _write_binary(file, "<IQQIII", int(identifier), payload_size,
                  len(vocab.words), vocab.min_n, vocab.max_n, bucket_field)
    encoded_words = (bytes(word, "utf-8") for word in vocab.words)
    _write_words_binary(encoded_words, file)


# Public API of this module.
__all__ = [
    'SubwordVocab',
    'FastTextVocab',
    'FinalfusionBucketVocab',
    'ExplicitVocab',
    'load_explicit_vocab',
    'load_fasttext_vocab',
    'load_finalfusion_bucket_vocab',
]