Skip to content

Commit ce59900

Browse files
authored
LT-22261: Problems with copied text when importing from flextext (#488)
* Fix LT-22261: import word categories using flextext * Add unit tests for importing categories * Fix bugs with matching analyses * Fix bugs in code used when FindOrCreateWfiAnalysis fails * Fix unit tests * Improve conditional on FindMatchingAnalysis
1 parent 8fe045d commit ce59900

File tree

2 files changed

+270
-28
lines changed

2 files changed

+270
-28
lines changed

Src/LexText/Interlinear/BIRDInterlinearImporter.cs

Lines changed: 198 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -769,48 +769,164 @@ private static IAnalysis CreateWordformWithWfiAnalysis(LcmCache cache, Word word
769769

770770
if (itemDict.ContainsKey("cf")) // Lex. Entries
771771
{
772+
// NB: "cf" records the lexeme, not the headword/citation form (in spite of the name).
772773
int ws_cf = GetWsEngine(wsFact, itemDict["cf"].Item1).Handle;
773774
ILexEntry entry = null;
774775
var entries = lex_entry_repo.AllInstances().Where(
775-
m => StringServices.CitationFormWithAffixTypeStaticForWs(m, ws_cf, string.Empty) == itemDict["cf"].Item2);
776-
if (entries.Count() == 1)
776+
m => DecorateFormWithAffixMarkers(m.LexemeFormOA?.MorphTypeRA, m.LexemeFormOA?.Form?.get_String(ws_cf)?.Text) == itemDict["cf"].Item2);
777+
778+
// Filter entries by homograph number.
779+
// If the lexeme and the headword are different,
780+
// then there may be more than one entry with the given homograph number.
781+
// This is because homograph numbers distinguish headwords rather than lexemes.
782+
// If there is no "hn" entry, then the hn is 0.
783+
string hn = "0";
784+
if (itemDict.ContainsKey("hn")) // Homograph Number
777785
{
778-
entry = entries.First();
786+
hn = itemDict["hn"].Item2;
779787
}
780-
else if (itemDict.ContainsKey("hn")) // Homograph Number
788+
var hnEntries = entries.Where(m => m.HomographNumber.ToString() == hn);
789+
if (hnEntries.Count() > 0)
781790
{
782-
entry = entries.FirstOrDefault(m => m.HomographNumber.ToString() == itemDict["hn"].Item2);
791+
entries = hnEntries;
783792
}
784-
if (entry != null)
785-
{
786-
bundle.MorphRA = entry.LexemeFormOA;
787793

788-
if (itemDict.ContainsKey("gls")) // Lex. Gloss
794+
if (itemDict.ContainsKey("gls")) // Lex. Gloss
795+
{
796+
// Filter senses by gloss.
797+
int ws_gls = GetWsEngine(wsFact, itemDict["gls"].Item1).Handle;
798+
IList<ILexSense> senses = new List<ILexSense>();
799+
foreach (var e in entries)
800+
{
801+
senses.AddRange(e.SensesOS.Where(s => s.Gloss.get_String(ws_gls).Text == itemDict["gls"].Item2));
802+
}
803+
if (senses.Count() > 1 && itemDict.ContainsKey("msa"))
789804
{
790-
int ws_gls = GetWsEngine(wsFact, itemDict["gls"].Item1).Handle;
791-
ILexSense sense = entry.SensesOS.FirstOrDefault(s => s.Gloss.get_String(ws_gls).Text == itemDict["gls"].Item2);
792-
if (sense != null)
805+
// Filter senses by MSA.
806+
IList<ILexSense> msaSenses = senses.Where(s => s.MorphoSyntaxAnalysisRA?.InterlinearAbbr == itemDict["msa"].Item2).ToList();
807+
if (msaSenses.Count() > 0)
793808
{
794-
bundle.SenseRA = sense;
809+
senses = msaSenses;
795810
}
796811
}
812+
// Record sense.
813+
if (senses.Count() > 0)
814+
{
815+
bundle.SenseRA = senses.FirstOrDefault();
816+
entry = bundle.SenseRA.Entry;
817+
}
818+
}
819+
820+
if (entry == null && entries.Count() > 0)
821+
{
822+
entry = entries.First();
823+
}
824+
825+
// Record morpheme.
826+
if (entry != null)
827+
{
828+
if (itemDict.ContainsKey("txt"))
829+
{
830+
// Try allomorph first.
831+
var ws_txt = GetWsEngine(wsFact, itemDict["txt"].Item1).Handle;
832+
bundle.MorphRA = entry.AllAllomorphs.Where(
833+
m => DecorateFormWithAffixMarkers(m.MorphTypeRA, m.Form.get_String(ws_txt).Text) == itemDict["txt"].Item2).FirstOrDefault();
834+
}
835+
if (bundle.MorphRA == null)
836+
{
837+
bundle.MorphRA = entry.LexemeFormOA;
838+
}
797839
}
798840
}
799841

800842
if (itemDict.ContainsKey("msa")) // Lex. Gram. Info
801843
{
802-
IMoMorphSynAnalysis match = msa_repo.AllInstances().FirstOrDefault(m => m.InterlinearAbbr == itemDict["msa"].Item2);
803-
if (match != null)
844+
if (bundle.SenseRA != null && bundle.SenseRA.MorphoSyntaxAnalysisRA?.InterlinearAbbr == itemDict["msa"].Item2)
845+
{
846+
bundle.MsaRA = bundle.SenseRA.MorphoSyntaxAnalysisRA;
847+
}
848+
else
849+
{
850+
IMoMorphSynAnalysis match = msa_repo.AllInstances().FirstOrDefault(m => m.InterlinearAbbr == itemDict["msa"].Item2);
851+
if (match != null)
852+
{
853+
bundle.MsaRA = match;
854+
}
855+
}
856+
}
857+
}
858+
}
859+
860+
// Try to fill in category.
861+
if (word.Items != null && wordForm.Analysis != null)
862+
{
863+
// Look for an existing category that matches a "pos".
864+
bool hasPOS = false;
865+
foreach (var item in word.Items)
866+
{
867+
if (wordForm.Analysis.CategoryRA != null)
868+
{
869+
// Category filled in.
870+
break;
871+
}
872+
if (item.type == "pos")
873+
{
874+
hasPOS = true;
875+
ILgWritingSystem writingSystem = GetWsEngine(cache.WritingSystemFactory, item.lang);
876+
if (writingSystem != null)
877+
{
878+
foreach (var cat in cache.LanguageProject.AllPartsOfSpeech)
879+
{
880+
if (MatchesCatNameOrAbbreviation(writingSystem.Handle, item.Value, cat))
881+
{
882+
wordForm.Analysis.CategoryRA = cat;
883+
break;
884+
}
885+
}
886+
}
887+
}
888+
}
889+
if (hasPOS && wordForm.Analysis.CategoryRA == null)
890+
{
891+
// Create a new category.
892+
IPartOfSpeech cat = cache.ServiceLocator.GetInstance<IPartOfSpeechFactory>().Create();
893+
cache.LanguageProject.PartsOfSpeechOA.PossibilitiesOS.Add(cat);
894+
foreach (var item in word.Items)
895+
{
896+
if (item.type == "pos")
804897
{
805-
bundle.MsaRA = match;
898+
ILgWritingSystem writingSystem = GetWsEngine(cache.WritingSystemFactory, item.lang);
899+
if (writingSystem != null)
900+
{
901+
cat.Name.set_String(writingSystem.Handle, item.Value);
902+
cat.Abbreviation.set_String(writingSystem.Handle, item.Value);
903+
}
806904
}
807905
}
906+
wordForm.Analysis.CategoryRA = cat;
808907
}
809908
}
810909

811910
return wordForm;
812911
}
813912

913+
// Based on StringServices.DecorateFormWithAffixMarkers.
914+
/// <summary>
/// Returns <paramref name="form"/> with the morph type's Prefix marker prepended and its
/// Postfix marker appended, skipping any marker that is null or empty.
/// </summary>
/// <param name="mmt">Morph type that supplies the Prefix and Postfix markers. When null, the form is returned unchanged.</param>
/// <param name="form">The undecorated form. When null, null is returned.</param>
/// <returns>The decorated form, or <paramref name="form"/> as passed in if either argument is null.</returns>
private static string DecorateFormWithAffixMarkers(IMoMorphType mmt, string form)
915+
{
916+
if (mmt == null || form == null)
917+
return form;
918+
// Add the prefix and postfix markers, if any.
919+
if (!String.IsNullOrEmpty(mmt.Prefix))
920+
{
921+
form = mmt.Prefix + form;
922+
}
923+
if (!String.IsNullOrEmpty(mmt.Postfix))
924+
{
925+
form = form + mmt.Postfix;
926+
}
927+
return form;
928+
}
929+
814930
private static bool FindOrCreateWfiAnalysis(LcmCache cache, Word word,
815931
int mainWritingSystem,
816932
out IAnalysis analysis)
@@ -820,6 +936,7 @@ private static bool FindOrCreateWfiAnalysis(LcmCache cache, Word word,
820936
// First, collect all expected forms and glosses from the Word
821937
var expectedForms = new Dictionary<int, string>(); // wsHandle -> expected value
822938
var expectedGlosses = new Dictionary<int, string>(); // wsHandle -> expected gloss
939+
var expectedCats = new Dictionary<int, string>(); // wsHandle -> expected cat
823940
IAnalysis candidateForm = null;
824941
ITsString wordForm = null;
825942
ITsString punctForm = null;
@@ -871,6 +988,10 @@ private static bool FindOrCreateWfiAnalysis(LcmCache cache, Word word,
871988

872989
expectedGlosses[ws.Handle] = wordItem.Value;
873990
break;
991+
992+
case "pos":
993+
expectedCats[ws.Handle] = wordItem.Value;
994+
break;
874995
}
875996
}
876997

@@ -896,23 +1017,57 @@ private static bool FindOrCreateWfiAnalysis(LcmCache cache, Word word,
8961017
return true;
8971018
}
8981019

1020+
analysis = FindMatchingAnalysis(cache, candidateWordform, word, expectedGlosses, expectedCats);
1021+
if (analysis != null)
1022+
{
1023+
return true;
1024+
}
1025+
1026+
if (wordForm.Text.ToLower() != wordForm.Text)
1027+
{
1028+
// Try lowercase.
1029+
var lcCandidateForm = cache.ServiceLocator
1030+
.GetInstance<IWfiWordformRepository>()
1031+
.GetMatchingWordform(wordForm.get_WritingSystemAt(0), wordForm.Text.ToLower());
1032+
if (lcCandidateForm is IWfiWordform lcCandidateWordform)
1033+
{
1034+
analysis = FindMatchingAnalysis(cache, lcCandidateWordform, word, expectedGlosses, expectedCats);
1035+
if (analysis != null)
1036+
{
1037+
return true;
1038+
}
1039+
}
1040+
}
1041+
1042+
// No matching analysis found with all expected gloss and morpheme data
1043+
analysis = AddEmptyAnalysisToWordform(cache, candidateWordform);
1044+
return false;
1045+
}
1046+
1047+
private static IAnalysis FindMatchingAnalysis(LcmCache cache, IWfiWordform candidateWordform, Word word,
1048+
Dictionary<int, string> expectedGlosses, Dictionary<int, string> expectedCats)
1049+
{
1050+
IAnalysis analysis = null;
1051+
var wsFact = cache.WritingSystemFactory;
8991052
// Look for an analysis that has the correct morphemes and a matching gloss
9001053
foreach (var wfiAnalysis in candidateWordform.AnalysesOC)
9011054
{
9021055
var morphemeMatch = true;
9031056
// verify that the analysis has a Morph Bundle with the expected morphemes from the import
904-
if (word.morphemes != null && wfiAnalysis.MorphBundlesOS.Count == word.morphemes?.morphs.Length)
1057+
if (word.morphemes != null && wfiAnalysis.MorphBundlesOS.Count == word.morphemes?.morphs.Length &&
1058+
word.morphemes.analysisStatus == analysisStatusTypes.humanApproved)
9051059
{
9061060
analysis = GetMostSpecificAnalysisForWordForm(wfiAnalysis);
907-
for(var i = 0; i < wfiAnalysis.MorphBundlesOS.Count; ++i)
1061+
for (var i = 0; i < wfiAnalysis.MorphBundlesOS.Count; ++i)
9081062
{
909-
var extantMorphForm = wfiAnalysis.MorphBundlesOS[i].Form;
1063+
var morphBundle = wfiAnalysis.MorphBundlesOS[i];
1064+
var extantMorphForm = morphBundle.Form;
9101065
var importMorphForm = word.morphemes.morphs[i].items.FirstOrDefault(item => item.type == "txt");
9111066
var importFormWs = GetWsEngine(wsFact, importMorphForm?.lang);
9121067
// compare the import item to the extant morph form
9131068
if (importMorphForm == null || extantMorphForm == null ||
9141069
TsStringUtils.IsNullOrEmpty(extantMorphForm.get_String(importFormWs.Handle)) ||
915-
!extantMorphForm.get_String(importFormWs.Handle).Text.Normalize()
1070+
!DecorateFormWithAffixMarkers(morphBundle.MorphRA?.MorphTypeRA, extantMorphForm.get_String(importFormWs.Handle).Text).Normalize()
9161071
.Equals(importMorphForm.Value?.Normalize()))
9171072
{
9181073
morphemeMatch = false;
@@ -923,18 +1078,14 @@ private static bool FindOrCreateWfiAnalysis(LcmCache cache, Word word,
9231078

9241079
if (morphemeMatch)
9251080
{
926-
var matchingGloss = wfiAnalysis.MeaningsOC.FirstOrDefault(g => VerifyGlossesMatch(g, expectedGlosses));
1081+
var matchingGloss = wfiAnalysis.MeaningsOC.FirstOrDefault(g => VerifyGlossesMatch(g, expectedGlosses, expectedCats));
9271082
if (matchingGloss != null)
9281083
{
929-
analysis = matchingGloss;
930-
return true;
1084+
return matchingGloss;
9311085
}
9321086
}
9331087
}
934-
935-
// No matching analysis found with all expected gloss and morpheme data
936-
analysis = AddEmptyAnalysisToWordform(cache, candidateWordform);
937-
return false;
1088+
return null;
9381089
}
9391090

9401091
private static IAnalysis GetMostSpecificAnalysisForWordForm(IAnalysis candidateWordform)
@@ -1031,7 +1182,8 @@ private static bool MatchPrimaryFormAndAddMissingAlternatives(IAnalysis wordForm
10311182

10321183
// Helper method to verify that all expected glosses match the stored glosses
10331184
private static bool VerifyGlossesMatch(IWfiGloss wfiGloss,
1034-
Dictionary<int, string> expectedGlosses)
1185+
Dictionary<int, string> expectedGlosses,
1186+
Dictionary<int, string> expectedCats)
10351187
{
10361188
foreach (var expectedGloss in expectedGlosses)
10371189
{
@@ -1042,10 +1194,28 @@ private static bool VerifyGlossesMatch(IWfiGloss wfiGloss,
10421194
if (storedGloss == null || storedGloss.Text != expectedValue)
10431195
return false; // Mismatch found
10441196
}
1197+
foreach (var expectedCat in expectedCats)
1198+
{
1199+
if (!MatchesCatNameOrAbbreviation(expectedCat.Key, expectedCat.Value, wfiGloss.Analysis?.CategoryRA))
1200+
return false;
1201+
}
10451202

10461203
return true;
10471204
}
10481205

1206+
/// <summary>
/// Determines whether <paramref name="text"/> equals either the Name or the Abbreviation
/// of <paramref name="cat"/> in the writing system <paramref name="ws"/>.
/// The comparison uses string equality (==), so it is case-sensitive.
/// </summary>
/// <param name="ws">Writing system handle passed to get_String for both Name and Abbreviation.</param>
/// <param name="text">The category text to look for.</param>
/// <param name="cat">The part of speech to test. A null category never matches.</param>
/// <returns>True if the name or the abbreviation matches; otherwise false.</returns>
private static bool MatchesCatNameOrAbbreviation(int ws, string text, IPartOfSpeech cat)
1207+
{
1208+
if (cat == null)
1209+
return false;
1210+
// Check the name first.
1211+
ITsString name = cat.Name.get_String(ws);
1212+
if (name != null && name.Text == text)
1213+
return true;
1214+
// Fall back to the abbreviation.
1215+
ITsString abbr = cat.Abbreviation.get_String(ws);
1216+
if (abbr != null && abbr.Text == text)
1217+
return true;
1218+
return false;
1219+
}
1218+
10491219
/// <summary>
10501220
/// </summary>
10511221
/// <param name="wordForm">The word Gloss. If multiple glosses, returns the last one created.</param>

0 commit comments

Comments
 (0)