lthn.io/app/Core/Lang/TranslationMemory/FuzzyMatcher.php
Claude 41a90cbff8
feat: lthn.io API serving live chain data
Fixed: basePath self→static binding, namespace detection, event wiring,
SQLite cache, file cache driver. All Mod Boot classes converted to
$listens pattern for lifecycle event discovery.

Working endpoints:
- /v1/explorer/info — live chain height, difficulty, aliases
- /v1/explorer/stats — formatted chain statistics
- /v1/names/directory — alias directory grouped by type
- /v1/names/available/{name} — name availability check
- /v1/names/lookup/{name} — name details

Co-Authored-By: Charon <charon@lethean.io>
2026-04-03 17:17:42 +01:00

445 lines
12 KiB
PHP

<?php
/*
* Core PHP Framework
*
* Licensed under the European Union Public Licence (EUPL) v1.2.
* See LICENSE file for details.
*/
declare(strict_types=1);
namespace Core\Lang\TranslationMemory;
use Core\Lang\TranslationMemory\Contracts\TranslationMemoryRepository;
use Illuminate\Support\Collection;
/**
 * Fuzzy Matcher for Translation Memory.
 *
 * Provides fuzzy matching capabilities to find similar translations
 * when an exact match is not available. Uses multiple algorithms:
 *
 * - Levenshtein distance for character-level similarity
 * - Token/word-based matching for structural similarity
 * - N-gram matching for partial phrase matching
 *
 * Match scores are combined with the translation quality score
 * to provide a confidence rating for each suggestion.
 *
 * All character-level work is multibyte aware: lengths, n-grams and edit
 * distances are measured in characters, never in bytes.
 *
 * Usage:
 *   $matcher = new FuzzyMatcher($repository);
 *   $suggestions = $matcher->findSimilar('Hello world', 'en_GB', 'de_DE', 0.7);
 *
 * Configuration in config/core.php:
 *   'lang' => [
 *       'translation_memory' => [
 *           'fuzzy' => [
 *               'min_similarity' => 0.6,
 *               'max_results' => 10,
 *               'algorithm' => 'combined', // levenshtein, token, ngram, combined
 *           ],
 *       ],
 *   ]
 */
class FuzzyMatcher
{
    /**
     * Default minimum similarity threshold.
     */
    protected const DEFAULT_MIN_SIMILARITY = 0.6;

    /**
     * Default maximum results to return.
     */
    protected const DEFAULT_MAX_RESULTS = 10;

    /**
     * N-gram size for n-gram matching.
     */
    protected const NGRAM_SIZE = 3;

    /**
     * Longest string (in characters) compared character by character;
     * longer strings fall back to word-level Levenshtein.
     */
    protected const CHAR_LEVENSHTEIN_LIMIT = 255;

    /**
     * Create a new fuzzy matcher.
     */
    public function __construct(
        protected TranslationMemoryRepository $repository,
    ) {}

    /**
     * Find similar translations using fuzzy matching.
     *
     * Every entry for the locale pair is scored against the normalised source,
     * entries below the similarity threshold are dropped, and the remainder is
     * ordered by confidence (70% similarity, 30% entry quality).
     *
     * @param  string  $source  Source text to match
     * @param  string  $sourceLocale  Source locale
     * @param  string  $targetLocale  Target locale
     * @param  float|null  $minSimilarity  Minimum similarity threshold (0.0-1.0); null reads the configured value
     * @param  int|null  $maxResults  Maximum number of results; null reads the configured value
     * @return Collection<int, array{entry: TranslationMemoryEntry, similarity: float, confidence: float}>
     */
    public function findSimilar(
        string $source,
        string $sourceLocale,
        string $targetLocale,
        ?float $minSimilarity = null,
        ?int $maxResults = null,
    ): Collection {
        $minSimilarity ??= config('core.lang.translation_memory.fuzzy.min_similarity', self::DEFAULT_MIN_SIMILARITY);
        $maxResults ??= config('core.lang.translation_memory.fuzzy.max_results', self::DEFAULT_MAX_RESULTS);

        $entries = $this->repository->findByLocalePair($sourceLocale, $targetLocale);

        if ($entries->isEmpty()) {
            return collect();
        }

        // Normalise the query once; each candidate is normalised inside the loop.
        $normalizedSource = $this->normalize($source);

        return $entries
            ->map(function (TranslationMemoryEntry $entry) use ($normalizedSource) {
                $similarity = $this->calculateSimilarity($normalizedSource, $this->normalize($entry->getSource()));

                // Combine similarity with quality score for overall confidence
                $confidence = ($similarity * 0.7) + ($entry->getQuality() * 0.3);

                return [
                    'entry' => $entry,
                    'similarity' => round($similarity, 4),
                    'confidence' => round($confidence, 4),
                ];
            })
            ->filter(fn (array $match) => $match['similarity'] >= $minSimilarity)
            ->sortByDesc('confidence')
            ->take($maxResults)
            ->values();
    }

    /**
     * Get the best match for a source text.
     *
     * An exact repository hit is returned first; otherwise the top fuzzy
     * match (if any) is returned.
     *
     * NOTE(review): for an exact hit the confidence is the entry's raw quality,
     * whereas fuzzy matches use the 70/30 similarity/quality blend. The two
     * values are therefore not on the same scale - confirm whether callers
     * rely on this before changing it.
     *
     * @param  string  $source  Source text to match
     * @param  string  $sourceLocale  Source locale
     * @param  string  $targetLocale  Target locale
     * @param  float|null  $minSimilarity  Minimum similarity threshold
     * @return array{entry: TranslationMemoryEntry, similarity: float, confidence: float}|null
     */
    public function getBestMatch(
        string $source,
        string $sourceLocale,
        string $targetLocale,
        ?float $minSimilarity = null,
    ): ?array {
        // First, try exact match
        $exact = $this->repository->findExact($source, $sourceLocale, $targetLocale);

        if ($exact !== null) {
            return [
                'entry' => $exact,
                'similarity' => 1.0,
                'confidence' => $exact->getQuality(),
            ];
        }

        // Fall back to the single best fuzzy match
        return $this->findSimilar($source, $sourceLocale, $targetLocale, $minSimilarity, 1)->first();
    }

    /**
     * Calculate similarity between two strings using the configured algorithm.
     *
     * The inputs are compared as given; callers wanting case/whitespace
     * insensitivity should normalise first (findSimilar() does).
     *
     * @return float Similarity score (0.0-1.0)
     */
    public function calculateSimilarity(string $a, string $b): float
    {
        if ($a === $b) {
            return 1.0;
        }

        // Compare against '' explicitly: empty() would also treat the text "0" as empty.
        if ($a === '' || $b === '') {
            return 0.0;
        }

        $algorithm = config('core.lang.translation_memory.fuzzy.algorithm', 'combined');

        $score = match ($algorithm) {
            'levenshtein' => $this->levenshteinSimilarity($a, $b),
            'token' => $this->tokenSimilarity($a, $b),
            'ngram' => $this->ngramSimilarity($a, $b),
            default => $this->combinedSimilarity($a, $b),
        };

        // Keep the documented 0.0-1.0 contract even if an algorithm drifts out of range.
        return max(0.0, min(1.0, $score));
    }

    /**
     * Calculate Levenshtein-based similarity.
     *
     * Distance and length are always measured in the same unit (characters,
     * or words for long strings), so the score cannot go negative.
     *
     * @return float Similarity score (0.0-1.0)
     */
    protected function levenshteinSimilarity(string $a, string $b): float
    {
        $charsInA = mb_strlen($a);
        $charsInB = mb_strlen($b);
        $maxLen = max($charsInA, $charsInB);

        if ($maxLen === 0) {
            return 1.0;
        }

        if ($maxLen > self::CHAR_LEVENSHTEIN_LIMIT) {
            // For longer strings, use word-level Levenshtein
            $wordsA = explode(' ', $a);
            $wordsB = explode(' ', $b);
            $distance = $this->arrayLevenshtein($wordsA, $wordsB);
            $maxLen = max(count($wordsA), count($wordsB));
        } elseif (strlen($a) === $charsInA && strlen($b) === $charsInB) {
            // One byte per character: the byte-based built-in is exact here.
            $distance = levenshtein($a, $b);
        } else {
            // Multibyte text: levenshtein() counts bytes, which would exceed the
            // character length, so compare character arrays instead.
            $charsA = mb_str_split($a);
            $charsB = mb_str_split($b);
            $distance = $this->arrayLevenshtein($charsA, $charsB);
            $maxLen = max(count($charsA), count($charsB));
        }

        return 1.0 - ($distance / $maxLen);
    }

    /**
     * Calculate token/word-based similarity.
     *
     * Uses Jaccard similarity on the sets of distinct word tokens, so a word
     * repeated within one string counts only once.
     *
     * @return float Similarity score (0.0-1.0)
     */
    protected function tokenSimilarity(string $a, string $b): float
    {
        $setA = array_unique($this->tokenize($a));
        $setB = array_unique($this->tokenize($b));

        if (empty($setA) && empty($setB)) {
            return 1.0;
        }

        if (empty($setA) || empty($setB)) {
            return 0.0;
        }

        $shared = count(array_intersect($setA, $setB));

        // |A u B| = |A| + |B| - |A n B|; at least 1 because both sets are non-empty.
        $union = count($setA) + count($setB) - $shared;

        return $shared / $union;
    }

    /**
     * Calculate n-gram similarity.
     *
     * Uses character n-grams for partial matching.
     *
     * @return float Similarity score (0.0-1.0)
     */
    protected function ngramSimilarity(string $a, string $b): float
    {
        $ngramsA = $this->getNgrams($a);
        $ngramsB = $this->getNgrams($b);

        if (empty($ngramsA) && empty($ngramsB)) {
            return 1.0;
        }

        if (empty($ngramsA) || empty($ngramsB)) {
            return 0.0;
        }

        // Multiset intersection: an n-gram shared k times on both sides counts k times.
        $intersectionCount = 0;
        foreach (array_intersect_key($ngramsA, $ngramsB) as $ngram => $countInA) {
            $intersectionCount += min($countInA, $ngramsB[$ngram]);
        }

        // Dice coefficient
        return (2 * $intersectionCount) / (array_sum($ngramsA) + array_sum($ngramsB));
    }

    /**
     * Calculate combined similarity using multiple algorithms.
     *
     * Weighted combination of Levenshtein, token, and n-gram similarity.
     *
     * @return float Similarity score (0.0-1.0)
     */
    protected function combinedSimilarity(string $a, string $b): float
    {
        $levenshtein = $this->levenshteinSimilarity($a, $b);
        $token = $this->tokenSimilarity($a, $b);
        $ngram = $this->ngramSimilarity($a, $b);

        // Weighted average: token similarity is most important for translations
        return ($levenshtein * 0.25) + ($token * 0.50) + ($ngram * 0.25);
    }

    /**
     * Normalize text for comparison.
     *
     * Lowercases the text, collapses every run of whitespace to a single
     * space and strips leading/trailing whitespace.
     */
    protected function normalize(string $text): string
    {
        $lowered = mb_strtolower($text);

        // The /u modifier keeps the pattern from matching inside multibyte characters.
        $collapsed = preg_replace('/\s+/u', ' ', $lowered);

        if ($collapsed === null) {
            // preg_replace() returns null on failure (e.g. malformed UTF-8):
            // retry byte-wise, and as a last resort keep the text as it is.
            $collapsed = preg_replace('/\s+/', ' ', $lowered) ?? $lowered;
        }

        return trim($collapsed);
    }

    /**
     * Tokenize text into words.
     *
     * Splits on runs of whitespace and punctuation; empty tokens are dropped.
     *
     * @return array<string>
     */
    protected function tokenize(string $text): array
    {
        $tokens = preg_split('/[\s\p{P}]+/u', $text, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false on failure; treat that as "no tokens".
        return $tokens ?: [];
    }

    /**
     * Get character n-grams from text.
     *
     * Strings shorter than NGRAM_SIZE yield themselves as a single n-gram;
     * the empty string yields none.
     *
     * @return array<string, int> N-gram counts
     */
    protected function getNgrams(string $text): array
    {
        // Split once instead of calling mb_substr() at every offset.
        $chars = mb_str_split($text);
        $len = count($chars);

        if ($len < self::NGRAM_SIZE) {
            // For short strings, use the whole string as an n-gram
            return $len > 0 ? [$text => 1] : [];
        }

        $ngrams = [];
        $lastStart = $len - self::NGRAM_SIZE;

        for ($start = 0; $start <= $lastStart; $start++) {
            $ngram = implode('', array_slice($chars, $start, self::NGRAM_SIZE));
            $ngrams[$ngram] = ($ngrams[$ngram] ?? 0) + 1;
        }

        return $ngrams;
    }

    /**
     * Calculate Levenshtein distance for arrays (word-level or character-level).
     *
     * Elements are compared with strict equality. Only two rows of the
     * distance table are kept at any time.
     *
     * @param  array<string>  $a
     * @param  array<string>  $b
     * @return int Number of insertions, deletions and substitutions turning $a into $b
     */
    protected function arrayLevenshtein(array $a, array $b): int
    {
        // Re-index so positional access below is safe for any input array.
        $a = array_values($a);
        $b = array_values($b);

        $m = count($a);
        $n = count($b);

        if ($m === 0) {
            return $n;
        }

        if ($n === 0) {
            return $m;
        }

        // Row for the empty prefix of $a: turning nothing into $b[0..j) costs j.
        $previous = range(0, $n);

        for ($i = 1; $i <= $m; $i++) {
            // Turning $a[0..i) into nothing costs i deletions.
            $current = [$i];

            for ($j = 1; $j <= $n; $j++) {
                $cost = ($a[$i - 1] === $b[$j - 1]) ? 0 : 1;

                $current[$j] = min(
                    $previous[$j] + 1,         // Deletion
                    $current[$j - 1] + 1,      // Insertion
                    $previous[$j - 1] + $cost, // Substitution
                );
            }

            $previous = $current;
        }

        return $previous[$n];
    }

    /**
     * Suggest translations for multiple source texts.
     *
     * Results are keyed by source text, so duplicate sources share one entry
     * and are only looked up once. PHP converts integer-like string keys
     * (e.g. "42") to integer keys.
     *
     * @param  array<string>  $sources  Source texts to match
     * @param  string  $sourceLocale  Source locale
     * @param  string  $targetLocale  Target locale
     * @param  float|null  $minSimilarity  Minimum similarity threshold
     * @return array<string, array{entry: TranslationMemoryEntry, similarity: float, confidence: float}|null>
     */
    public function suggestBatch(
        array $sources,
        string $sourceLocale,
        string $targetLocale,
        ?float $minSimilarity = null,
    ): array {
        $results = [];

        foreach ($sources as $source) {
            // array_key_exists() rather than isset(): a null result is a valid answer.
            if (array_key_exists($source, $results)) {
                continue;
            }

            $results[$source] = $this->getBestMatch($source, $sourceLocale, $targetLocale, $minSimilarity);
        }

        return $results;
    }

    /**
     * Get similarity thresholds for categorizing matches.
     *
     * Ordered from strictest to most lenient; categorizeSimilarity() relies
     * on that order.
     *
     * @return array{exact: float, high: float, medium: float, low: float}
     */
    public static function getThresholds(): array
    {
        return [
            'exact' => 1.0,
            'high' => 0.9,
            'medium' => 0.75,
            'low' => 0.6,
        ];
    }

    /**
     * Categorize a similarity score.
     *
     * @return string One of: 'exact', 'high', 'medium', 'low', 'none'
     */
    public static function categorizeSimilarity(float $similarity): string
    {
        // First (strictest) threshold the score reaches wins.
        foreach (self::getThresholds() as $category => $minimum) {
            if ($similarity >= $minimum) {
                return $category;
            }
        }

        return 'none';
    }
}