396 lines
12 KiB
Python
396 lines
12 KiB
Python
import importlib
|
|
from codecs import IncrementalDecoder
|
|
from collections import Counter
|
|
from functools import lru_cache
|
|
from typing import Counter as TypeCounter, Dict, List, Optional, Tuple
|
|
|
|
from .constant import (
|
|
FREQUENCIES,
|
|
KO_NAMES,
|
|
LANGUAGE_SUPPORTED_COUNT,
|
|
TOO_SMALL_SEQUENCE,
|
|
ZH_NAMES,
|
|
)
|
|
from .md import is_suspiciously_successive_range
|
|
from .models import CoherenceMatches
|
|
from .utils import (
|
|
is_accentuated,
|
|
is_latin,
|
|
is_multi_byte_encoding,
|
|
is_unicode_range_secondary,
|
|
unicode_range,
|
|
)
|
|
|
|
|
|
def encoding_unicode_range(iana_name: str) -> List[str]:
    """
    List the dominant unicode ranges produced by a single-byte code page.

    Every byte from 0x40 up to (but excluding) 0xFF is decoded on its own through the
    codec's incremental decoder. Each non-secondary unicode range hit that way is tallied,
    and only the ranges holding at least 15% of the tallied characters are returned,
    sorted by name.

    Raises IOError when iana_name designates a multi-byte code page.
    """
    if is_multi_byte_encoding(iana_name):
        raise IOError("Function not supported on multi-byte code page")

    codec_module = importlib.import_module("encodings.{}".format(iana_name))
    byte_decoder: IncrementalDecoder = codec_module.IncrementalDecoder(errors="ignore")

    range_hits: Dict[str, int] = {}

    for byte_value in range(0x40, 0xFF):
        decoded: str = byte_decoder.decode(bytes([byte_value]))

        # errors="ignore" yields an empty string for bytes the code page does not map.
        if not decoded:
            continue

        range_name: Optional[str] = unicode_range(decoded)

        if range_name is None:
            continue

        if is_unicode_range_secondary(range_name) is not False:
            continue

        range_hits[range_name] = range_hits.get(range_name, 0) + 1

    # Only evaluated in the filter below when at least one range was tallied.
    total_hits: int = sum(range_hits.values())

    return sorted(
        range_name
        for range_name, hits in range_hits.items()
        if hits / total_hits >= 0.15
    )
|
|
|
|
|
|
def unicode_range_languages(primary_range: str) -> List[str]:
    """
    List the languages of FREQUENCIES owning at least one character in the given unicode range.

    Languages come out in the iteration order of FREQUENCIES.
    """
    return [
        language
        for language, alphabet in FREQUENCIES.items()
        if any(unicode_range(letter) == primary_range for letter in alphabet)
    ]
|
|
|
|
|
|
@lru_cache()
def encoding_languages(iana_name: str) -> List[str]:
    """
    Associate a single-byte encoding with the language(s) it is most likely used for.

    The first unicode range of the code page whose name does not contain "Latin" drives
    the lookup. When every range is Latin, the generic ["Latin Based"] marker is returned.
    """
    non_latin_ranges = (
        candidate
        for candidate in encoding_unicode_range(iana_name)
        if "Latin" not in candidate
    )
    primary_range: Optional[str] = next(non_latin_ranges, None)

    if primary_range is not None:
        return unicode_range_languages(primary_range)

    return ["Latin Based"]
|
|
|
|
|
|
@lru_cache()
|
|
def mb_encoding_languages(iana_name: str) -> List[str]:
|
|
"""
|
|
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
|
|
This function does the correspondence.
|
|
"""
|
|
if (
|
|
iana_name.startswith("shift_")
|
|
or iana_name.startswith("iso2022_jp")
|
|
or iana_name.startswith("euc_j")
|
|
or iana_name == "cp932"
|
|
):
|
|
return ["Japanese"]
|
|
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
|
|
return ["Chinese"]
|
|
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
|
|
return ["Korean"]
|
|
|
|
return []
|
|
|
|
|
|
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
def get_target_features(language: str) -> Tuple[bool, bool]:
    """
    Report two traits of a supported language, as a (has accents, is pure Latin) tuple.

    The first flag is True when at least one character of the language is accentuated.
    The second is True unless at least one character is reported as non-Latin.
    """
    alphabet = FREQUENCIES[language]

    has_accents: bool = any(is_accentuated(letter) for letter in alphabet)
    only_latin: bool = not any(is_latin(letter) is False for letter in alphabet)

    return has_accents, only_latin
|
|
|
|
|
|
def alphabet_languages(
    characters: List[str], ignore_non_latin: bool = False
) -> List[str]:
    """
    Return the languages whose alphabet is sufficiently covered by the given characters.

    A language is retained when at least 20% of its FREQUENCIES characters are present in
    characters. A language is skipped when ignore_non_latin is set and it is not purely
    Latin, or when the given characters contain accents while the language has none.

    The result is ordered from the best covered language to the least covered one.
    """
    languages: List[Tuple[str, float]] = []

    source_have_accents = any(is_accentuated(character) for character in characters)

    # Built once: each alphabet character is looked up in constant time instead of
    # rescanning the whole list for every language.
    available_characters = set(characters)

    for language, language_characters in FREQUENCIES.items():
        target_have_accents, target_pure_latin = get_target_features(language)

        if ignore_non_latin and target_pure_latin is False:
            continue

        if target_have_accents is False and source_have_accents:
            continue

        character_count: int = len(language_characters)

        # An empty alphabet cannot be covered; skip it rather than divide by zero.
        if character_count == 0:
            continue

        character_match_count: int = sum(
            1 for c in language_characters if c in available_characters
        )

        ratio: float = character_match_count / character_count

        if ratio >= 0.2:
            languages.append((language, ratio))

    # Stable sort: languages with equal ratios keep their FREQUENCIES order.
    languages.sort(key=lambda x: x[1], reverse=True)

    return [compatible_language[0] for compatible_language in languages]
|
|
|
|
|
|
def characters_popularity_compare(
    language: str, ordered_characters: List[str]
) -> float:
    """
    Determine if an ordered character list (from most frequent to rarest) matches a particular language.

    The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
    Beware that this function is not strict on the match in order to ease the detection.
    (Meaning a close match is 1.)

    An empty ordered_characters list yields 0.

    Raises ValueError when language is not a key of FREQUENCIES.
    """
    if language not in FREQUENCIES:
        raise ValueError("{} not available".format(language))

    ordered_characters_count: int = len(ordered_characters)

    # Nothing to compare: report "no correspondence" instead of dividing by zero.
    if ordered_characters_count == 0:
        return 0.0

    language_characters: List[str] = FREQUENCIES[language]
    target_language_characters_count: int = len(language_characters)

    # First-occurrence rank of each character (same answer as list.index), computed once
    # so the loop below does not rescan the alphabet for every character.
    rank_in_language: Dict[str, int] = {}
    for rank, known_character in enumerate(language_characters):
        rank_in_language.setdefault(known_character, rank)

    large_alphabet: bool = target_language_characters_count > 26

    # Loop invariant: scales a rank in the source onto the language alphabet length.
    expected_projection_ratio: float = (
        target_language_characters_count / ordered_characters_count
    )

    character_approved_count: int = 0

    for character_rank, character in enumerate(ordered_characters):
        character_rank_in_language: Optional[int] = rank_in_language.get(character)

        if character_rank_in_language is None:
            continue

        character_rank_projection: int = int(character_rank * expected_projection_ratio)
        rank_distance: int = abs(character_rank_projection - character_rank_in_language)

        # Small alphabets: a character too far from its expected rank is rejected outright.
        if large_alphabet is False and rank_distance > 4:
            continue

        # Large alphabets: being within a third of the alphabet is good enough.
        if (
            large_alphabet is True
            and rank_distance < target_language_characters_count / 3
        ):
            character_approved_count += 1
            continue

        characters_before_source: List[str] = language_characters[
            0:character_rank_in_language
        ]
        characters_after_source: List[str] = language_characters[
            character_rank_in_language:
        ]
        characters_before: List[str] = ordered_characters[0:character_rank]
        characters_after: List[str] = ordered_characters[character_rank:]

        before_match_count: int = len(
            set(characters_before) & set(characters_before_source)
        )

        after_match_count: int = len(
            set(characters_after) & set(characters_after_source)
        )

        # An empty "before" slice always has zero matches, so this branch also shields
        # the division further down from an empty denominator.
        if len(characters_before_source) == 0 and before_match_count <= 4:
            character_approved_count += 1
            continue

        if len(characters_after_source) == 0 and after_match_count <= 4:
            character_approved_count += 1
            continue

        if (
            before_match_count / len(characters_before_source) >= 0.4
            or after_match_count / len(characters_after_source) >= 0.4
        ):
            character_approved_count += 1
            continue

    return character_approved_count / ordered_characters_count
|
|
|
|
|
|
def alpha_unicode_split(decoded_sequence: str) -> List[str]:
    """
    Split a decoded text into one lowercase string per alphabet / unicode range.

    Ex. a text mixing English/Latin with a bit of Hebrew yields two items: one holding the
    Latin letters and the other the Hebrew ones. Non-alphabetic characters and characters
    without a known unicode range are dropped.
    """
    buckets: Dict[str, List[str]] = {}

    for symbol in decoded_sequence:
        if not symbol.isalpha():
            continue

        symbol_range: Optional[str] = unicode_range(symbol)

        if symbol_range is None:
            continue

        # Reuse the first already discovered layer that is not a suspicious neighbour of
        # this character's range; otherwise the character's own range names the layer.
        destination: str = next(
            (
                known_range
                for known_range in buckets
                if is_suspiciously_successive_range(known_range, symbol_range)
                is False
            ),
            symbol_range,
        )

        buckets.setdefault(destination, []).append(symbol.lower())

    return ["".join(pieces) for pieces in buckets.values()]
|
|
|
|
|
|
def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches:
    """
    Merge several results previously produced by the function coherence_ratio.

    Ratios reported for the same language are averaged and rounded to 4 decimals. The
    merged list has the same shape as a coherence_ratio result and is ordered from the
    highest ratio to the lowest.
    """
    ratios_by_language: Dict[str, List[float]] = {}

    for matches in results:
        for language, ratio in matches:
            ratios_by_language.setdefault(language, []).append(ratio)

    averaged = [
        (language, round(sum(ratios) / len(ratios), 4))
        for language, ratios in ratios_by_language.items()
    ]
    averaged.sort(key=lambda entry: entry[1], reverse=True)

    return averaged
|
|
|
|
|
|
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
    """
    We shall NOT return "English—" in CoherenceMatches because it is an alternative
    of "English". This function keeps only the best ratio per language and removes the
    em-dash from the language name.

    The em-dash is stripped even when the alternative does not collide with its base
    language, so a lone "English—" match is reported as "English". Languages are returned
    in order of first appearance in results; a new list is always returned.
    """
    best_ratio_by_language: Dict[str, float] = {}

    for language, ratio in results:
        no_em_name: str = language.replace("—", "")
        known_ratio = best_ratio_by_language.get(no_em_name)

        if known_ratio is None or ratio > known_ratio:
            best_ratio_by_language[no_em_name] = ratio

    return list(best_ratio_by_language.items())
|
|
|
|
|
|
@lru_cache(maxsize=2048)
def coherence_ratio(
    decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None
) -> CoherenceMatches:
    """
    Detect ANY language that can be identified in the given sequence, analysing it by layers.
    A layer = characters extracted per alphabet/range.

    lg_inclusion is an optional comma separated list of languages to test instead of
    the ones inferred from each layer. The special entry "Latin Based" is not tested as
    a language: it restricts the inferred languages to the purely Latin ones.

    Matches scoring below threshold are dropped; the result is ordered from the highest
    ratio to the lowest.
    """
    forced_languages: List[str] = (
        [] if lg_inclusion is None else lg_inclusion.split(",")
    )

    latin_only: bool = "Latin Based" in forced_languages
    if latin_only:
        forced_languages.remove("Latin Based")

    matches: List[Tuple[str, float]] = []
    strong_match_total: int = 0

    for layer in alpha_unicode_split(decoded_sequence):
        occurrences: TypeCounter[str] = Counter(layer)
        ranked = occurrences.most_common()

        layer_size: int = sum(count for _, count in ranked)

        if layer_size <= TOO_SMALL_SEQUENCE:
            continue

        characters_by_popularity: List[str] = [character for character, _ in ranked]

        candidates = forced_languages or alphabet_languages(
            characters_by_popularity, latin_only
        )

        for language in candidates:
            score: float = characters_popularity_compare(
                language, characters_by_popularity
            )

            if score < threshold:
                continue

            if score >= 0.8:
                strong_match_total += 1

            matches.append((language, round(score, 4)))

            # Enough strong matches overall: stop testing languages for this layer.
            if strong_match_total >= 3:
                break

    return sorted(
        filter_alt_coherence_matches(matches), key=lambda x: x[1], reverse=True
    )
|