1
0
mirror of git://jb55.com/damus synced 2024-09-28 16:00:43 +00:00

translate: implement string distance for close matches

Implement the Levenshtein string distance algorithm to determine
whether the translation is too similar to the original content.

Closes: https://github.com/damus-io/damus/issues/1996

Lightning-address: kernelkind@getalby.com
Signed-off-by: kernelkind <kernelkind@gmail.com>
Reviewed-by: William Casarin <jb55@jb55.com>
Link: 20240214032018.57812-1-kernelkind@gmail.com
Signed-off-by: William Casarin <jb55@jb55.com>
This commit is contained in:
kernelkind 2024-02-13 22:20:18 -05:00 committed by William Casarin
parent 90180202b6
commit 58326f679e
3 changed files with 119 additions and 0 deletions

View File

@ -627,6 +627,7 @@
D7FF94002AC7AC5300FD969D /* RelayURL.swift in Sources */ = {isa = PBXBuildFile; fileRef = D7FF93FF2AC7AC5200FD969D /* RelayURL.swift */; };
E02B54182B4DFADA0077FF42 /* Bech32ObjectTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E02B54172B4DFADA0077FF42 /* Bech32ObjectTests.swift */; };
E04A37C62B544F090029650D /* URIParsing.swift in Sources */ = {isa = PBXBuildFile; fileRef = E04A37C52B544F090029650D /* URIParsing.swift */; };
E0E024112B7C19C20075735D /* TranslationTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = E0E024102B7C19C20075735D /* TranslationTests.swift */; };
E4FA1C032A24BB7F00482697 /* SearchSettingsView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E4FA1C022A24BB7F00482697 /* SearchSettingsView.swift */; };
E990020F2955F837003BBC5A /* EditMetadataView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E990020E2955F837003BBC5A /* EditMetadataView.swift */; };
E9E4ED0B295867B900DD7078 /* ThreadView.swift in Sources */ = {isa = PBXBuildFile; fileRef = E9E4ED0A295867B900DD7078 /* ThreadView.swift */; };
@ -1402,6 +1403,7 @@
D7FF93FF2AC7AC5200FD969D /* RelayURL.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = RelayURL.swift; sourceTree = "<group>"; };
E02B54172B4DFADA0077FF42 /* Bech32ObjectTests.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = Bech32ObjectTests.swift; sourceTree = "<group>"; };
E04A37C52B544F090029650D /* URIParsing.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = URIParsing.swift; sourceTree = "<group>"; };
E0E024102B7C19C20075735D /* TranslationTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = TranslationTests.swift; sourceTree = "<group>"; };
E4FA1C022A24BB7F00482697 /* SearchSettingsView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SearchSettingsView.swift; sourceTree = "<group>"; };
E990020E2955F837003BBC5A /* EditMetadataView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = EditMetadataView.swift; sourceTree = "<group>"; };
E9E4ED0A295867B900DD7078 /* ThreadView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ThreadView.swift; sourceTree = "<group>"; };
@ -2504,6 +2506,7 @@
D72A2CFF2AD9B66B002AFF62 /* EventViewTests.swift */,
D7315A2B2ACDF4DA0036E30A /* DamusCacheManagerTests.swift */,
B501062C2B363036003874F5 /* AuthIntegrationTests.swift */,
E0E024102B7C19C20075735D /* TranslationTests.swift */,
);
path = damusTests;
sourceTree = "<group>";
@ -3443,6 +3446,7 @@
D7315A2C2ACDF4DA0036E30A /* DamusCacheManagerTests.swift in Sources */,
4C9054852A6AEAA000811EEC /* NdbTests.swift in Sources */,
75AD872B2AA23A460085EF2C /* Block+Tests.swift in Sources */,
E0E024112B7C19C20075735D /* TranslationTests.swift in Sources */,
F944F56E29EA9CCC0067B3BF /* DamusParseContentTests.swift in Sources */,
3A5E47C72A4A76C800C0D090 /* TrieTests.swift in Sources */,
B501062D2B363036003874F5 /* AuthIntegrationTests.swift in Sources */,

View File

@ -21,6 +21,8 @@ enum TranslateStatus: Equatable {
case not_needed
}
fileprivate let MIN_UNIQUE_CHARS = 2
struct TranslateView: View {
let damus_state: DamusState
let event: NostrEvent
@ -107,6 +109,10 @@ struct TranslateView: View {
attempt_translation()
}
}
/// Reports whether `translated` is far enough from `original`, measured with
/// `levenshteinDistanceIsGreaterThanOrEqualTo` against the file-level
/// `MIN_UNIQUE_CHARS` threshold.
///
/// - Parameters:
///   - original: The text before translation.
///   - translated: The text returned by the translation.
/// - Returns: The result of the distance check for the two strings.
func translationMeetsStringDistanceRequirements(original: String, translated: String) -> Bool {
    let minimumEditDistance = MIN_UNIQUE_CHARS
    return levenshteinDistanceIsGreaterThanOrEqualTo(
        from: original,
        to: translated,
        threshold: minimumEditDistance
    )
}
}
extension View {
@ -141,6 +147,10 @@ func translate_note(profiles: Profiles, keypair: Keypair, event: NostrEvent, set
// if its the same, give up and don't retry
return .not_needed
}
guard translationMeetsStringDistanceRequirements(original: originalContent, translated: translated_note) else {
return .not_needed
}
// Render translated note
let translated_blocks = parse_note_content(content: .content(translated_note, event.tags))
@ -158,3 +168,50 @@ func current_language() -> String {
}
}
/// Reports whether the Levenshtein (edit) distance between two strings is at least `threshold`.
///
/// The distance counts single-`Character` insertions, deletions and substitutions.
///
/// - Parameters:
///   - source: The string to transform.
///   - target: The string `source` is transformed into.
///   - threshold: The minimum edit distance for which the result is `true`.
/// - Returns: `true` when the edit distance is greater than or equal to `threshold`.
func levenshteinDistanceIsGreaterThanOrEqualTo(from source: String, to target: String, threshold: Int) -> Bool {
    // Copy the characters out once. Offsetting a `String.Index` walks the string,
    // so recomputing it for every matrix cell would add a linear factor per cell.
    let sourceChars = Array(source)
    let targetChars = Array(target)
    let sourceCount = sourceChars.count
    let targetCount = targetChars.count

    // The edit distance is never smaller than the difference in lengths, so this
    // alone can settle the answer without filling in the matrix.
    if abs(sourceCount - targetCount) >= threshold {
        return true
    }

    // With an empty string the distance is simply the other string's length.
    // Answering here also keeps the `1...count` ranges below from trapping,
    // because a closed range requires lowerBound <= upperBound.
    guard sourceCount > 0, targetCount > 0 else {
        return max(sourceCount, targetCount) >= threshold
    }

    // matrix[i][j] holds the distance between the first i characters of `source`
    // and the first j characters of `target`.
    var matrix = [[Int]](repeating: [Int](repeating: 0, count: targetCount + 1), count: sourceCount + 1)

    for i in 0...sourceCount {
        matrix[i][0] = i
    }

    for j in 0...targetCount {
        matrix[0][j] = j
    }

    for i in 1...sourceCount {
        var rowMin = Int.max

        for j in 1...targetCount {
            let cost = sourceChars[i - 1] == targetChars[j - 1] ? 0 : 1

            matrix[i][j] = min(
                matrix[i - 1][j] + 1,        // Deletion
                matrix[i][j - 1] + 1,        // Insertion
                matrix[i - 1][j - 1] + cost  // Substitution
            )

            rowMin = min(rowMin, matrix[i][j])
        }

        // Once every cell of a row has reached the threshold, the final distance
        // cannot drop below it, so the remaining rows need not be computed.
        if rowMin >= threshold {
            return true
        }
    }

    return matrix[sourceCount][targetCount] >= threshold
}
/// Reports whether `translated` differs enough from `original`: the edit
/// distance between them, as checked by
/// `levenshteinDistanceIsGreaterThanOrEqualTo`, must reach `MIN_UNIQUE_CHARS`.
///
/// - Parameters:
///   - original: The text before translation.
///   - translated: The text returned by the translation.
/// - Returns: `true` when the edit distance is at least `MIN_UNIQUE_CHARS`.
func translationMeetsStringDistanceRequirements(original: String, translated: String) -> Bool {
    let requiredDistance = MIN_UNIQUE_CHARS
    let isDistinctEnough = levenshteinDistanceIsGreaterThanOrEqualTo(
        from: original,
        to: translated,
        threshold: requiredDistance
    )
    return isDistinctEnough
}

View File

@ -0,0 +1,58 @@
//
// TranslationTests.swift
// damusTests
//
// Created by KernelKind on 2/13/24.
//
import XCTest
@testable import damus
/// Table-driven tests for the translation string-distance helpers.
final class TranslationTests : XCTestCase {
    /// (original, translated, whether the translation is expected to be distinct enough)
    let translationStringDistanceCases = [
        ("test", "test ", false),
        ("wat", "what", false),
        ("wat's the wether like", "what's the weather like", true),
        ("GM GZY⚡\n\redacted 🍆🦪🤙 https://video.nostr.build/7dadcc39e83cbc37c99fabb883314f29c169c1bd994f1d525bde6e9817facc85.mp4 ", "GM GZY⚡\n\redacted 🍆🦪🤙 https://video.nostr.build/7dadcc39e83cbc37c99fabb883314f29c169c1bd994f1d525bde6e9817facc85.mp4", false),
        ("Fucking nostr forever typos lol 😂", "Fucking nostr forever typo's lol 😂", false),
        ("where's the library", "donde esta la libreria", true),
        ("In America", "En América", true)
    ]

    /// Checks `translationMeetsStringDistanceRequirements` against every case in the table.
    func testStringDistanceRequirements() {
        for (original, translated, expectedVal) in translationStringDistanceCases {
            // The message identifies the failing row, since all cases share one assertion line.
            XCTAssertEqual(
                translationMeetsStringDistanceRequirements(original: original, translated: translated),
                expectedVal,
                "original: \"\(original)\", translated: \"\(translated)\""
            )
        }
    }

    let levenshteinDistanceCases = [
        // (original string, mutated string, number of changes from original to mutated)
        ("hello", "hello", 0),                      // No change
        ("123", "1234", 1),                         // Addition at the end
        ("abcd", "abcde", 1),                       // Addition at the end
        ("abc", "a", 2),                            // Multiple deletions
        ("abcdef", "abc", 3),                       // Multiple deletions
        ("2024", "2025", 1),                        // Single substitution
        ("openai", "opnai", 1),                     // Single deletion
        ("swift", "swiift", 1),                     // Single addition
        ("language", "languag", 1),                 // Single deletion at the end
        ("example", "sxample", 1),                  // Single substitution at the beginning
        ("distance", "d1stanc3", 2),                // Substitutions
        ("python", "pyth0n", 1),                    // Single substitution
        ("algorithm", "algor1thm", 1),              // Single substitution in the middle
        ("implementation", "implemenation", 1),     // Single deletion (typo)
        ("correction", "correctionn", 1),           // Single addition at the end
        ("levenshtein", "levenshtien", 2),          // Transposition
        ("threshold", "threshhold", 1),             // Single addition (double letter)
        ("functionality", "fuctionality", 1),       // Single deletion (common typo)
        ("assessment", "assesment", 1),             // Single deletion (common typo)
        ("performance", "performence", 1),          // Single substitution (common typo)
    ]

    /// Pins the exact distance for each case: the check must pass at `numChanges`
    /// and fail at `numChanges + 1`.
    func testLevenshteinDistance() {
        for (original, mutated, numChanges) in levenshteinDistanceCases {
            XCTAssertTrue(
                levenshteinDistanceIsGreaterThanOrEqualTo(from: original, to: mutated, threshold: numChanges),
                "\"\(original)\" -> \"\(mutated)\" should be at least \(numChanges) edits apart"
            )
            XCTAssertFalse(
                levenshteinDistanceIsGreaterThanOrEqualTo(from: original, to: mutated, threshold: numChanges+1),
                "\"\(original)\" -> \"\(mutated)\" should be fewer than \(numChanges + 1) edits apart"
            )
        }
    }
}