Source code for carpedm.data.lang

#
# Copyright (C) 2018 Neal Digre.
#
# This software may be modified and distributed under the terms
# of the MIT license. See the LICENSE file for details.

"""Language-specific and unicode utilities.

Todo:
    * Variable UNK token in Vocabulary
"""
import abc
import os


[docs]def code2hex(code): """Returns hex integer for a unicode string. The argument code could either be an ascii representation, (e.g. U+3055, <UNK>) or a unicode character. Args: code (str): Code to convert. Returns: int: """ try: _ = code.encode('ascii') except UnicodeEncodeError: code = char2code(code) # Code is either 'U+XXXX(X)' code or unknown format. code = code.lstrip('U+') if 'U+' in code else code try: result = int(code, 16) except ValueError: # Not a number, so probably just a raw ascii character. result = code return result
[docs]def code2char(code): """Returns the unicode string for the character.""" try: char = chr(code2hex(code)) except (ValueError, TypeError): char = code return char
[docs]def char2code(unicode): """Returns the ASCII code for a unicode character. Args: unicode (str): Raises: TypeError: string is length two. """ try: code = "U+{0:04x}".format(ord(unicode)) except TypeError: code = unicode return code
[docs]class CharacterSet(object): """Character set abstract class.""" __metaclass__ = abc.ABCMeta def __init__(self, charset, name=None): """Initializer Args: charset (:obj:`str` or :obj:`list` of :obj:`str`): ID for types of characters to include, if string, or a list of characters to include, either as ascii codes, e.g. U+3055, or unicode characters. """ self._ranges = self._unicode_ranges(charset) if name: self.name = name elif isinstance(charset, str): self.name = charset else: assert isinstance(charset, list) self.name = 'chars' @property def presets(self): """Pre-defined character sets. Returns: :obj:`list` of :obj:`str`: Character set IDs. """ return [] def _unicode_ranges(self, charset): """Returns appropriate unicode ranges for specified ``charset``. Args: charset (str or :obj:`list` of :obj:`str`): ID of character set to use. Returns: :obj:`list` of :obj:`tuple`: Unicode ranges [(low, high)] """ return [(code2hex(c),) for c in charset]
[docs] def in_charset(self, unicode): """Check if a character is in the defined character set. Args: unicode (str): String representation of unicode value. """ hexcode = code2hex(unicode) if any([(len(r) == 1 and hexcode == r[0]) or (len(r) == 2 and r[0] <= hexcode <= r[1]) for r in self._ranges]): return True else: return False
[docs]class JapaneseUnicodes(CharacterSet): """Utility for accessing and manipulating Japanese character unicodes. Inherits from :obj:`CharacterSet`. Unicode ranges taken from [1] with edits for exceptions. References: [1] http://www.unicode.org/charts/ """ PUNCTUATION = [ (int('25a0', 16), int('25ff', 16)), # square (int('25b2', 16), int('25b3', 16)), # triangle (int('25cb', 16), int('25cf', 16)), # circle (int('25ef', 16),), # big circle (int('3200', 16), int('32ff', 16)), # filled big circles (int('3000', 16), int('303f', 16)), # CJK symbols, punctuation (int('3099', 16), int('309e', 16)), # voicing, iteration marks (int('30a0', 16),), # double hyphen (int('30fb', 16), int('30fe', 16)), # dot, prolonged, iteration (int('ff5b', 16), int('ff64', 16)), # brackets, halfwidth punctuation (int('ffed', 16), int('ffee', 16)) # halfwidth square, circle ] HIRAGANA = [ (int('3040', 16), int('3096', 16)), (int('309f', 16),) # より ] KATAKANA = [ (int('30a1', 16), int('30fa', 16)), (int('30ff', 16),), # コト (int('ff65', 16), int('ff9d', 16)) # halfwidth ] KANA = HIRAGANA + KATAKANA MISC = [ (int('0030', 16), int('0039', 16)), # digits (int('ff00', 16), int('ff5a', 16)), # roman characters (int('ffa0', 16), int('ffdc', 16)), # hangul characters (int('ffe0', 16), int('ffec', 16)), # symbols # (int('003f', 16), int('003f', 16)), # question mark ] # Kanji covers full CJK set and extensions KANJI = [ (int('3400', 16), int('4db5', 16)), (int('4e00', 16), int('9fea', 16)), (int('f900', 16), int('fad9', 16)), (int('20000', 16), int('2ebe0', 16)), ] ALL = HIRAGANA + KATAKANA + KANJI + PUNCTUATION + MISC SETS = { 'hiragana': HIRAGANA, 'katakana': KATAKANA, 'kana': KANA, 'kanji': KANJI, 'punct': PUNCTUATION, 'misc': MISC, 'all': ALL } def __init__(self, charset): super(JapaneseUnicodes, self).__init__(charset)
[docs] def presets(self): return self.SETS.keys()
def _unicode_ranges(self, charset): if isinstance(charset, str): ranges = [] for k, s in self.SETS.items(): if k == 'kana': if k in charset and not 'katakana' in charset: ranges += s else: if k in charset: ranges += s else: ranges = super(JapaneseUnicodes, self)._unicode_ranges(charset) assert len(ranges) > 0, "Invalid character set." return ranges
[docs]class Vocabulary(object): """Simple vocabulary wrapper. References: Lightly modified TensorFlow "im2txt" `Vocabulary`_. .. _Vocabulary: https://github.com/tensorflow/models/blob/master/ research/im2txt/im2txt/data/build_mscoco_data.py """ UNK = "<UNK>" def __init__(self, reserved, vocab): """Initializes the vocabulary. Args: reserved (tuple): Tuple of reserved tokens. vocab: (list): List of vocabulary entries, ideally (for visualization) in descending order by frequency. """ self._vocab = {} self._reserved = reserved for ix, char in enumerate(vocab): self._vocab[char] = ix add2id = 0 for i in range(len(reserved)): if i in self._vocab.values(): add2id += 1 self._vocab = {key: idx + add2id for key, idx in self._vocab.items()} for i, char in enumerate(reserved): self._vocab[char] = i try: self._unk_id = reserved.index(self.UNK) except ValueError: print("'{}' token not provided. Setting to highest ID.".format( self.UNK )) self._vocab[self.UNK] = len(self._vocab) self._rev_vocab = {idx: key for key, idx in self._vocab.items()} def save(self, out_dir, as_unicode=False): types = self.types(as_unicode) with open(os.path.join(out_dir, 'vocab.txt'), 'w') as f: for t in types: f.write(t + '\n') def types(self, as_unicode=False): types = [code2char(self._rev_vocab[idx]) if as_unicode else self._rev_vocab[idx] for idx in sorted(self._rev_vocab.keys())] return types
[docs] def char_to_id(self, char): """Returns the integer id of a character string.""" if char in self._vocab: return self._vocab[char] else: return self._unk_id
[docs] def id_to_char(self, char_id): """Returns the character string of a integer id.""" if char_id in self._rev_vocab: return self._rev_vocab[char_id] else: return self.UNK
[docs] def get_num_classes(self): """Returns number of classes, includes <UNK>.""" return len(self._vocab)
[docs] def get_num_reserved(self): """Returns number of reserved IDs.""" return len(self._reserved)