mirror of
git://jb55.com/damus
synced 2024-09-28 16:00:43 +00:00
translate: implement string distance for close matches
Implement the levenshtein string distance algorithm for determining if the translation is too similar to the original content. Closes: https://github.com/damus-io/damus/issues/1996 Lightning-address: kernelkind@getalby.com Signed-off-by: kernelkind <kernelkind@gmail.com> Reviewed-by: William Casarin <jb55@jb55.com> Link: 20240214032018.57812-1-kernelkind@gmail.com Signed-off-by: William Casarin <jb55@jb55.com>
This commit is contained in:
parent
90180202b6
commit
58326f679e
@ -627,6 +627,7 @@
|
||||
D7FF94002AC7AC5300FD969D /* RelayURL.swift in Sources */ = {isa = PBXBuildFile; fileRef = D7FF93FF2AC7AC5200FD969D /* RelayURL.swift */; };
|
||||
E02B54182B4DFADA0077FF42 /* Bech32ObjectTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E02B54172B4DFADA0077FF42 /* Bech32ObjectTests.swift */; };
|
||||
E04A37C62B544F090029650D /* URIParsing.swift in Sources */ = {isa = PBXBuildFile; fileRef = E04A37C52B544F090029650D /* URIParsing.swift */; };
|
||||
E0E024112B7C19C20075735D /* TranslationTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E0E024102B7C19C20075735D /* TranslationTests.swift */; };
|
||||
E4FA1C032A24BB7F00482697 /* SearchSettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E4FA1C022A24BB7F00482697 /* SearchSettingsView.swift */; };
|
||||
E990020F2955F837003BBC5A /* EditMetadataView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E990020E2955F837003BBC5A /* EditMetadataView.swift */; };
|
||||
E9E4ED0B295867B900DD7078 /* ThreadView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E9E4ED0A295867B900DD7078 /* ThreadView.swift */; };
|
||||
@ -1402,6 +1403,7 @@
|
||||
D7FF93FF2AC7AC5200FD969D /* RelayURL.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = RelayURL.swift; sourceTree = "<group>"; };
|
||||
E02B54172B4DFADA0077FF42 /* Bech32ObjectTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Bech32ObjectTests.swift; sourceTree = "<group>"; };
|
||||
E04A37C52B544F090029650D /* URIParsing.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = URIParsing.swift; sourceTree = "<group>"; };
|
||||
E0E024102B7C19C20075735D /* TranslationTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TranslationTests.swift; sourceTree = "<group>"; };
|
||||
E4FA1C022A24BB7F00482697 /* SearchSettingsView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SearchSettingsView.swift; sourceTree = "<group>"; };
|
||||
E990020E2955F837003BBC5A /* EditMetadataView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = EditMetadataView.swift; sourceTree = "<group>"; };
|
||||
E9E4ED0A295867B900DD7078 /* ThreadView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ThreadView.swift; sourceTree = "<group>"; };
|
||||
@ -2504,6 +2506,7 @@
|
||||
D72A2CFF2AD9B66B002AFF62 /* EventViewTests.swift */,
|
||||
D7315A2B2ACDF4DA0036E30A /* DamusCacheManagerTests.swift */,
|
||||
B501062C2B363036003874F5 /* AuthIntegrationTests.swift */,
|
||||
E0E024102B7C19C20075735D /* TranslationTests.swift */,
|
||||
);
|
||||
path = damusTests;
|
||||
sourceTree = "<group>";
|
||||
@ -3443,6 +3446,7 @@
|
||||
D7315A2C2ACDF4DA0036E30A /* DamusCacheManagerTests.swift in Sources */,
|
||||
4C9054852A6AEAA000811EEC /* NdbTests.swift in Sources */,
|
||||
75AD872B2AA23A460085EF2C /* Block+Tests.swift in Sources */,
|
||||
E0E024112B7C19C20075735D /* TranslationTests.swift in Sources */,
|
||||
F944F56E29EA9CCC0067B3BF /* DamusParseContentTests.swift in Sources */,
|
||||
3A5E47C72A4A76C800C0D090 /* TrieTests.swift in Sources */,
|
||||
B501062D2B363036003874F5 /* AuthIntegrationTests.swift in Sources */,
|
||||
|
@ -21,6 +21,8 @@ enum TranslateStatus: Equatable {
|
||||
case not_needed
|
||||
}
|
||||
|
||||
fileprivate let MIN_UNIQUE_CHARS = 2
|
||||
|
||||
struct TranslateView: View {
|
||||
let damus_state: DamusState
|
||||
let event: NostrEvent
|
||||
@ -107,6 +109,10 @@ struct TranslateView: View {
|
||||
attempt_translation()
|
||||
}
|
||||
}
|
||||
|
||||
func translationMeetsStringDistanceRequirements(original: String, translated: String) -> Bool {
|
||||
return levenshteinDistanceIsGreaterThanOrEqualTo(from: original, to: translated, threshold: MIN_UNIQUE_CHARS)
|
||||
}
|
||||
}
|
||||
|
||||
extension View {
|
||||
@ -141,6 +147,10 @@ func translate_note(profiles: Profiles, keypair: Keypair, event: NostrEvent, set
|
||||
// if its the same, give up and don't retry
|
||||
return .not_needed
|
||||
}
|
||||
|
||||
guard translationMeetsStringDistanceRequirements(original: originalContent, translated: translated_note) else {
|
||||
return .not_needed
|
||||
}
|
||||
|
||||
// Render translated note
|
||||
let translated_blocks = parse_note_content(content: .content(translated_note, event.tags))
|
||||
@ -158,3 +168,50 @@ func current_language() -> String {
|
||||
}
|
||||
}
|
||||
|
||||
func levenshteinDistanceIsGreaterThanOrEqualTo(from source: String, to target: String, threshold: Int) -> Bool {
|
||||
let sourceCount = source.count
|
||||
let targetCount = target.count
|
||||
|
||||
// Early return if the difference in lengths is already greater than or equal to the threshold,
|
||||
// indicating the edit distance meets the condition without further calculation.
|
||||
if abs(sourceCount - targetCount) >= threshold {
|
||||
return true
|
||||
}
|
||||
|
||||
var matrix = [[Int]](repeating: [Int](repeating: 0, count: targetCount + 1), count: sourceCount + 1)
|
||||
|
||||
for i in 0...sourceCount {
|
||||
matrix[i][0] = i
|
||||
}
|
||||
|
||||
for j in 0...targetCount {
|
||||
matrix[0][j] = j
|
||||
}
|
||||
|
||||
for i in 1...sourceCount {
|
||||
var rowMin = Int.max
|
||||
for j in 1...targetCount {
|
||||
let sourceIndex = source.index(source.startIndex, offsetBy: i - 1)
|
||||
let targetIndex = target.index(target.startIndex, offsetBy: j - 1)
|
||||
|
||||
let cost = source[sourceIndex] == target[targetIndex] ? 0 : 1
|
||||
matrix[i][j] = min(
|
||||
matrix[i - 1][j] + 1, // Deletion
|
||||
matrix[i][j - 1] + 1, // Insertion
|
||||
matrix[i - 1][j - 1] + cost // Substitution
|
||||
)
|
||||
rowMin = min(rowMin, matrix[i][j])
|
||||
}
|
||||
// If the minimum edit distance found in any row is already greater than or equal to the threshold,
|
||||
// you can conclude the edit distance meets the criteria.
|
||||
if rowMin >= threshold {
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return matrix[sourceCount][targetCount] >= threshold
|
||||
}
|
||||
|
||||
func translationMeetsStringDistanceRequirements(original: String, translated: String) -> Bool {
|
||||
return levenshteinDistanceIsGreaterThanOrEqualTo(from: original, to: translated, threshold: MIN_UNIQUE_CHARS)
|
||||
}
|
||||
|
58
damusTests/TranslationTests.swift
Normal file
58
damusTests/TranslationTests.swift
Normal file
@ -0,0 +1,58 @@
|
||||
//
|
||||
// TranslationTests.swift
|
||||
// damusTests
|
||||
//
|
||||
// Created by KernelKind on 2/13/24.
|
||||
//
|
||||
|
||||
import XCTest
|
||||
@testable import damus
|
||||
|
||||
final class TranslationTests : XCTestCase {
|
||||
let translationStringDistanceCases = [
|
||||
("test", "test ", false),
|
||||
("wat", "what", false),
|
||||
("wat's the wether like", "what's the weather like", true),
|
||||
("GM GZY⚡️\n\redacted 🍆🦪🤙 https://video.nostr.build/7dadcc39e83cbc37c99fabb883314f29c169c1bd994f1d525bde6e9817facc85.mp4 ", "GM GZY⚡️\n\redacted 🍆🦪🤙 https://video.nostr.build/7dadcc39e83cbc37c99fabb883314f29c169c1bd994f1d525bde6e9817facc85.mp4", false),
|
||||
("Fucking nostr forever typo’s lol 😂", "Fucking nostr forever typo's lol 😂", false),
|
||||
("where's the library", "donde esta la libreria", true),
|
||||
("In America", "En América", true)
|
||||
]
|
||||
|
||||
func testStringDistanceRequirements() {
|
||||
for (original, translated, expectedVal) in translationStringDistanceCases {
|
||||
XCTAssertEqual(translationMeetsStringDistanceRequirements(original: original, translated: translated), expectedVal)
|
||||
}
|
||||
}
|
||||
|
||||
let levenshteinDistanceCases = [
|
||||
// (original string, mutated string, number of changes from original to mutated)
|
||||
("hello", "hello", 0), // No change
|
||||
("123", "1234", 1), // Addition at the end
|
||||
("abcd", "abcde", 1), // Addition at the end
|
||||
("abc", "a", 2), // Multiple deletions
|
||||
("abcdef", "abc", 3), // Multiple deletions
|
||||
("2024", "2025", 1), // Single substitution
|
||||
("openai", "opnai", 1), // Single deletion
|
||||
("swift", "swiift", 1), // Single addition
|
||||
("language", "languag", 1), // Single deletion at the end
|
||||
("example", "sxample", 1), // Single substitution at the beginning
|
||||
("distance", "d1stanc3", 2), // Substitutions
|
||||
("python", "pyth0n", 1), // Single substitution
|
||||
("algorithm", "algor1thm", 1), // Single substitution in the middle
|
||||
("implementation", "implemenation", 1), // Single deletion (typo)
|
||||
("correction", "correctionn", 1), // Single addition at the end
|
||||
("levenshtein", "levenshtien", 2), // Transposition
|
||||
("threshold", "threshhold", 1), // Single addition (double letter)
|
||||
("functionality", "fuctionality", 1), // Single deletion (common typo)
|
||||
("assessment", "assesment", 1), // Single deletion (common typo)
|
||||
("performance", "performence", 1), // Single substitution (common typo)
|
||||
]
|
||||
|
||||
func testLevenshteinDistance() {
|
||||
for (original, mutated, numChanges) in levenshteinDistanceCases {
|
||||
XCTAssertTrue(levenshteinDistanceIsGreaterThanOrEqualTo(from: original, to: mutated, threshold: numChanges))
|
||||
XCTAssertFalse(levenshteinDistanceIsGreaterThanOrEqualTo(from: original, to: mutated, threshold: numChanges+1))
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue
Block a user