Source code for ffp.vocab

"""
Finalfusion vocabularies.
"""
from os import PathLike
from typing import Union

from ffp.io import ChunkIdentifier, find_chunk

from ffp.vocab.simple_vocab import load_simple_vocab, SimpleVocab
from ffp.vocab.subword import load_explicit_vocab, load_fasttext_vocab, \
    load_finalfusion_bucket_vocab, SubwordVocab, FastTextVocab,\
    FinalfusionBucketVocab, ExplicitVocab
from ffp.vocab.vocab import Vocab
from ffp.vocab.cutoff import Cutoff


[docs]def load_vocab(file: Union[str, bytes, int, PathLike]) -> Vocab: """ Load a vocabulary from a finalfusion file. Loads the first known vocabulary from a finalfusion file. Parameters ---------- file : str, bytes, int, PathLike Path to file containing a finalfusion vocab chunk. Returns ------- vocab : SimpleVocab, FastTextVocab, FinalfusionBucketVocab, ExplicitVocab First Vocab in the file. Raises ------ ValueError If the file did not contain a vocabulary. """ with open(file, "rb") as inf: chunk = find_chunk(inf, [ ChunkIdentifier.SimpleVocab, ChunkIdentifier.FastTextSubwordVocab, ChunkIdentifier.ExplicitSubwordVocab, ChunkIdentifier.BucketSubwordVocab ]) if chunk is None: raise ValueError('File did not contain a vocabulary') if chunk == ChunkIdentifier.SimpleVocab: return SimpleVocab.read_chunk(inf) if chunk == ChunkIdentifier.BucketSubwordVocab: return FinalfusionBucketVocab.read_chunk(inf) if chunk == ChunkIdentifier.ExplicitSubwordVocab: return ExplicitVocab.read_chunk(inf) if chunk == ChunkIdentifier.FastTextSubwordVocab: return FastTextVocab.read_chunk(inf) raise ValueError('Unexpected vocabulary chunk.')
__all__ = [ 'Vocab', 'ExplicitVocab', 'FinalfusionBucketVocab', 'FastTextVocab', 'SimpleVocab', 'load_finalfusion_bucket_vocab', 'load_fasttext_vocab', 'load_explicit_vocab', 'load_simple_vocab', 'load_vocab' ]