Source code for icu_tokenizer.normalizer

# -*- coding: utf-8 -*-

import re
import unicodedata
from typing import Dict, Iterable

import regex

from icu_tokenizer.utils import get_all_unicode_chars


class Normalizer(object):
    """Unicode information based normalizer.

    Does the following

    - Ensure NFKC format
    - Handle pseudo-spaces (for numbers)
    - Normalize by unicode categories
      https://www.fileformat.info/info/unicode/category/index.htm

      - ``[C*|So|Z*]`` → ' '
      - ``[Pc]`` → ``_``
      - ``[Pd]`` → ``-``
      - ``[Pf|Pi]`` → ``"`` (except for ``'``)
      - ``[Ps]`` → ``(`` (except for ``{``, ``[``)
      - ``[Pe]`` → ``)`` (except for ``}``, ``]``)

    - Normalize Nd (Numbers)
    - Account for some outliers
    - Remove non printable characters
    - Normalize whitespace characters
    - Perform language specific normalization

    Usage:
        >>> normalizer = Normalizer(lang, norm_puncts=True)
        >>> norm_text: str = normalizer.normalize(text)
    """
    def __init__(self, lang: str = 'en', norm_puncts: bool = False):
        """Normalizer.

        Args:
            lang (str, optional): Language identifier. Defaults to 'en'.
            norm_puncts (bool, optional): Normalize punctuations?
                Defaults to False.
        """
        # Handle control tokens
        self.ignore_pattern = regex.compile(r'\p{C}|\p{So}|\p{Z}')

        # Handle pseudo-spaces
        # Random note: it appears pseudo-spaces primarily make a difference
        # when numbers are involved
        self.pseudo_num_pattern = re.compile(r'(\d) (\d)')

        # Punctuation and number replace maps
        self.num_pattern = regex.compile(r'\p{Nd}+')
        self.punct_replace_map = self.punct_pattern = None
        if norm_puncts:
            self.punct_replace_map = make_punct_replace_map()
            self.punct_pattern = \
                make_pattern_from_keys(self.punct_replace_map.keys())

        # Other language specific normalizers
        lang_replace_map = make_lang_specific_replace_map(lang)
        self.lang_replace_map = self.lang_replace_pattern = None
        if len(lang_replace_map) > 0:
            self.lang_replace_map = lang_replace_map
            self.lang_replace_pattern = \
                make_pattern_from_keys(lang_replace_map.keys())
    def _num_replace_fn(self, match: re.Match) -> str:
        # int() accepts any unicode decimal digits, so this converts them
        # to ASCII (note: it also strips leading zeros, e.g. '007' -> '7')
        return str(int(match.group(0)))

    def _punct_replace_fn(self, match: re.Match) -> str:
        return self.punct_replace_map[match.group(0)]

    def _lang_replace_fn(self, match: re.Match) -> str:
        return self.lang_replace_map[match.group(0)]
    def normalize(self, text: str) -> str:
        """Perform normalization.

        Args:
            text (str): Input text

        Returns:
            str: Normalized text
        """
        text = unicodedata.normalize('NFKC', text)
        text = self.pseudo_num_pattern.sub(r'\1.\2', text)
        text = self.num_pattern.sub(self._num_replace_fn, text)
        if self.punct_pattern is not None:
            text = self.punct_pattern.sub(self._punct_replace_fn, text)
        text = self.ignore_pattern.sub(' ', text)
        text = ' '.join(text.split())  # Normalize whitespace
        if self.lang_replace_pattern is not None:
            text = self.lang_replace_pattern.sub(
                self._lang_replace_fn, text)
        return text
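
# Illustrative sketch (not part of the module): expected behaviour of
# Normalizer.normalize, using the default maps defined below. The sample
# strings are hypothetical; outputs assume norm_puncts=True.
#
#   >>> normalizer = Normalizer(lang='en', norm_puncts=True)
#   >>> normalizer.normalize('“quoted” text… १२३')  # Pi/Pf quotes, Nd digits
#   '"quoted" text... 123'
#   >>> normalizer.normalize('12 345')  # pseudo-space between digits
#   '12.345'
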
def make_pattern_from_keys(keys: Iterable[str]) -> re.Pattern:
    """Make a re.Pattern that matches a list of strings."""
    # Sort longest-first so multi-character keys (e.g. ``´´``) match
    # before their single-character prefixes (e.g. ``´``)
    keys = sorted(keys, key=len, reverse=True)
    pattern_str = r'|'.join(re.escape(k) for k in keys)
    return re.compile(pattern_str)


def make_punct_replace_map() -> Dict[str, str]:
    """Make the punctuation replace map."""
    # Generate punctuation and number replace maps
    punct_replace_map = {}

    # Normalization rules based on unicode category
    punct_exceptions = {"'", '[', ']', '{', '}'}
    for c in get_all_unicode_chars():
        if c in punct_exceptions:
            continue
        cat = unicodedata.category(c)
        if cat == 'Pc':
            punct_replace_map[c] = '_'
        elif cat == 'Pd':
            punct_replace_map[c] = '-'
        elif cat == 'Pe':
            punct_replace_map[c] = ')'
        elif cat == 'Pf':
            punct_replace_map[c] = '"'
        elif cat == 'Pi':
            punct_replace_map[c] = '"'
        elif cat == 'Ps':
            punct_replace_map[c] = '('

    # User provided rules
    # Soft hyphen
    punct_replace_map['\xad'] = ''

    # Double quotes
    punct_replace_map["''"] = '"'
    punct_replace_map['´´'] = '"'
    punct_replace_map['„'] = '"'

    # Apostrophes
    punct_replace_map['`'] = "'"
    punct_replace_map['´'] = "'"
    punct_replace_map['‘'] = "'"
    punct_replace_map['’'] = "'"
    punct_replace_map['‚'] = "'"  # Not a comma

    # Brackets
    punct_replace_map['【'] = '['
    punct_replace_map['】'] = ']'
    punct_replace_map['［'] = '['
    punct_replace_map['］'] = ']'

    # Common unicode variations
    punct_replace_map['∶'] = ':'
    punct_replace_map['？'] = '?'
    punct_replace_map['．'] = '.'
    punct_replace_map['━'] = '-'
    punct_replace_map['％'] = '%'

    # Chinese punctuations
    punct_replace_map['！'] = '!'
    punct_replace_map['、'] = ','
    punct_replace_map['｜'] = '|'
    punct_replace_map['：'] = ':'
    punct_replace_map['；'] = ';'
    punct_replace_map['，'] = ','
    punct_replace_map['。'] = '.'
    punct_replace_map['～'] = '~'

    # Others
    punct_replace_map['…'] = '...'

    return punct_replace_map


def make_lang_specific_replace_map(lang: str = 'en') -> Dict[str, str]:
    """Create a language specific replace map."""
    replace_map = {}

    if lang == 'ro':
        # Remove diacritics for Romanian
        replace_map['Ş'] = 'S'
        replace_map['ş'] = 's'
        replace_map['Ș'] = 'S'
        replace_map['ș'] = 's'
        replace_map['Ţ'] = 'T'
        replace_map['ţ'] = 't'
        replace_map['Ț'] = 'T'
        replace_map['ț'] = 't'
        replace_map['Ă'] = 'A'
        replace_map['ă'] = 'a'
        replace_map['Â'] = 'A'
        replace_map['â'] = 'a'
        replace_map['Î'] = 'I'
        replace_map['î'] = 'i'

    return replace_map
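
# A minimal usage sketch (not part of the module), assuming the package
# keeps this file importable as icu_tokenizer.normalizer:
#
#   >>> from icu_tokenizer.normalizer import Normalizer
#   >>> ro_normalizer = Normalizer(lang='ro')  # Romanian diacritic stripping
#   >>> ro_normalizer.normalize('Ştiinţă')
#   'Stiinta'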