diff --git a/rudict/grammar_case.csv b/rudict/grammar_case.csv index ca9bdfe..f6ff90c 100644 --- a/rudict/grammar_case.csv +++ b/rudict/grammar_case.csv @@ -143,6 +143,30 @@ SECOND_MALE_SSFORM_ANIMATE;NC_SINGULAR;NGC_P1_NOMINATIVE;ь;парень ;;NGC_P4_ACCUSATIVE;ей;обвиняю парней ;;NGC_P5_INSTRUMENTAL;ями;говорю с парнями ;;NGC_P6_PREPOSITIONAL;ях;говорю о парнях +SECOND_I_SHORT_INANIMATE;NC_SINGULAR;NGC_P1_NOMINATIVE;й;каравай +;;NGC_P2_GENITIVE;я;подарок для каравая +;;NGC_P3_DATIVE;ю;подарок караваю +;;NGC_P4_ACCUSATIVE;й;обвиняю каравай +;;NGC_P5_INSTRUMENTAL;ем;говорю с караваем +;;NGC_P6_PREPOSITIONAL;е;говорю о каравае +;NC_PLURAL;NGC_P1_NOMINATIVE;и;караваи +;;NGC_P2_GENITIVE;ев;подарок для караваев +;;NGC_P3_DATIVE;ям;подарок караваям +;;NGC_P4_ACCUSATIVE;и;обвиняю караваи +;;NGC_P5_INSTRUMENTAL;ями;говорю с караваями +;;NGC_P6_PREPOSITIONAL;ях;говорю о караваях +SECOND_I_SHORT_ANIMATE;NC_SINGULAR;NGC_P1_NOMINATIVE;й;герой +;;NGC_P2_GENITIVE;я;подарок для героя +;;NGC_P3_DATIVE;ю;подарок герою +;;NGC_P4_ACCUSATIVE;я;обвиняю героя +;;NGC_P5_INSTRUMENTAL;ем;говорю с героем +;;NGC_P6_PREPOSITIONAL;е;говорю о герое +;NC_PLURAL;NGC_P1_NOMINATIVE;и;герои +;;NGC_P2_GENITIVE;ев;подарок для героев +;;NGC_P3_DATIVE;ям;подарок героям +;;NGC_P4_ACCUSATIVE;ев;обвиняю героев +;;NGC_P5_INSTRUMENTAL;ями;говорю с героями +;;NGC_P6_PREPOSITIONAL;ях;говорю о героях SECOND_NEUTRAL_E_FORM;NC_SINGULAR;NGC_P1_NOMINATIVE;е;поле ;;NGC_P2_GENITIVE;я;подарок для поля ;;NGC_P3_DATIVE;ю;подарок полю diff --git a/rudict/grammar_case.xlsx b/rudict/grammar_case.xlsx index ffc9621..9c40441 100644 Binary files a/rudict/grammar_case.xlsx and b/rudict/grammar_case.xlsx differ diff --git a/rudict/rudict/main.cpp b/rudict/rudict/main.cpp index ea6df40..f82aef6 100644 --- a/rudict/rudict/main.cpp +++ b/rudict/rudict/main.cpp @@ -12,6 +12,7 @@ int main() LoadNounDeclencionCaseTable(); LoadFrequentWordSet(); FillDivisionCaseMaps(); + CalculatePluralForm(); //RecognizeNoun(L"стульями"); //Косяк: 
"вечер" diff --git a/rudict/rudict/noun.cpp b/rudict/rudict/noun.cpp index fa10204..1d2a048 100644 --- a/rudict/rudict/noun.cpp +++ b/rudict/rudict/noun.cpp @@ -120,6 +120,17 @@ NounDeclencion WStringToNounDeclencion(std::wstring str) { return SECOND_MALE_SSFORM_ANIMATE; } + + if (str == L"SECOND_I_SHORT_INANIMATE") + { + return SECOND_I_SHORT_INANIMATE; + } + if (str == L"SECOND_I_SHORT_ANIMATE") + { + return SECOND_I_SHORT_ANIMATE; + } + + if (str == L"SECOND_NEUTRAL_E_FORM") { return SECOND_NEUTRAL_E_FORM; @@ -225,6 +236,10 @@ std::vector GetAllNounEndingArr() std::vector result { L"", + + L"й", + L"ев", + L"а", L"и", L"е", @@ -266,18 +281,143 @@ bool NounIsInDictionary(std::wstring nounNominative) } -bool NounSpecialPluralFormIsInDictionary(std::wstring nounNominativePlural) +std::wstring convertToStandardPluralForm(std::wstring s) { - for (auto& noun : NounRecordArr) + std::wstring pluralForm = s; + + if (pluralForm[pluralForm.size() - 1] == L'а' && charIsIFormConsolant(pluralForm[pluralForm.size() - 2])) { - if (noun.specialMultipleForm == nounNominativePlural) + pluralForm[pluralForm.size() - 1] = L'и'; + } + else if (pluralForm[pluralForm.size() - 1] == L'а' && charIsUFormConsolant(pluralForm[pluralForm.size() - 2])) + { + pluralForm[pluralForm.size() - 1] = L'ы'; + } + else if (pluralForm[pluralForm.size() - 1] == L'я') + { + pluralForm[pluralForm.size() - 1] = L'и'; + } + else if (charIsIFormConsolant(pluralForm[pluralForm.size() - 1])) + { + pluralForm += L'и'; + } + else if (charIsUFormConsolant(pluralForm[pluralForm.size() - 1])) + { + pluralForm += L'ы'; + } + else if (pluralForm[pluralForm.size() - 1] == L'ь') + { + pluralForm[pluralForm.size() - 1] = L'и'; + } + else if (pluralForm[pluralForm.size() - 1] == L'й') + { + pluralForm[pluralForm.size() - 1] = L'и'; + } + else if (pluralForm[pluralForm.size() - 1] == L'о') + { + pluralForm[pluralForm.size() - 1] = L'а'; + } + else if (pluralForm[pluralForm.size() - 1] == L'е') + { + 
pluralForm[pluralForm.size() - 1] = L'я'; + } + else + { + std::cout << "Error in convertToStandardPluralForm" << std::endl; + } + + return pluralForm; +} + +std::wstring convertFromStandardToAlternativePluralForm(std::wstring s) +{ + if (s[s.size() - 1] == L'и') + { + s[s.size() - 1] = L'я'; + } + if (s[s.size() - 1] == L'ы') + { + s[s.size() - 1] = L'а'; + } + + return s; +} + +std::set getPluralForm(NounRecord noun) +{ + std::set result; + + if (noun.specialMultipleForm != L"") + { + result.insert(noun.specialMultipleForm); + } + + if (noun.haveSingleForm) + { + if (noun.haveStandardMultipleForm) { - return true; + std::wstring pluralForm = convertToStandardPluralForm(noun.nominativeForm); + + if (noun.haveStandardMultipleFormEnding) + { + result.insert(pluralForm); + } + + if (noun.haveAlternativeMultipleFormEnding) + { + result.insert(convertFromStandardToAlternativePluralForm(pluralForm)); + } + + } + + if (noun.haveStandardMultipleFormWithMissingLastVowel) + { + std::wstring pluralForm = convertToStandardPluralForm(noun.nominativeForm); + + wchar_t prevsschar = pluralForm[pluralForm.size() - 4]; + + if (charIsMissingVowelSoftenerConsolant(prevsschar)) + { + pluralForm[pluralForm.size() - 3] = L'ь'; + } + else + { + pluralForm.erase(pluralForm.begin() + pluralForm.size() - 3); + } + + if (noun.haveStandardMultipleFormEnding) + { + result.insert(pluralForm); + } + + if (noun.haveAlternativeMultipleFormEnding) + { + result.insert(convertFromStandardToAlternativePluralForm(pluralForm)); + } } - if (!noun.haveSingleForm && noun.nominativeForm == nounNominativePlural) + + + } + else + { + result.insert(noun.nominativeForm); + } + + + return result; +} + +bool NounPluralFormIsInDictionary(std::wstring nounNominativePlural) +{ + for (auto& noun : NounRecordArr) + { + if (noun.haveMultipleForm) { - return true; + if (noun.precalculatedNominativePluralSet.count(nounNominativePlural) != 0) + { + return true; + } } } @@ -298,29 +438,26 @@ NounRecord 
GetNounRecordFromDictionary(std::wstring nounNominative) } -NounRecord GetNounRecordFromDictionary_BySpecialPluralForm(std::wstring nounNominativePlural) +NounRecord GetNounRecordFromDictionary_ByPluralForm(std::wstring nounNominativePlural) { for (auto& noun : NounRecordArr) { - if (noun.specialMultipleForm == nounNominativePlural) + if (noun.haveMultipleForm) { - return noun; - } - - if (!noun.haveSingleForm && noun.nominativeForm == nounNominativePlural) - { - return noun; + if (noun.precalculatedNominativePluralSet.count(nounNominativePlural) != 0) + { + return noun; + } } } - return{}; } -bool charIsConsolant(wchar_t c) +bool charIsConsolant(wchar_t c) //except й { - std::wstring consolants = L"йцкнгшщзхфвпрлджчсмтб"; + std::wstring consolants = L"цкнгшщзхфвпрлджчсмтб"; for (wchar_t ic : consolants) { @@ -381,10 +518,13 @@ std::set getPossibleNounEndingDivisionSet(std::wstring noun) { std::wstring nounBase = boost::replace_last_copy(noun, ending, ""); - if (charIsConsolant(nounBase[nounBase.size() - 1]) || nounBase[nounBase.size() - 1] == L'ь' || nounBase[nounBase.size() - 1] == L'ъ') + + + if ((charIsVowel(nounBase[nounBase.size() - 1])) || //Might be exact the й case + (charIsConsolant(nounBase[nounBase.size() - 1]) || nounBase[nounBase.size() - 1] == L'ь' || nounBase[nounBase.size() - 1] == L'ъ')) { result.insert({ nounBase, ending, NounEndingDivision::DC_COMMON }); - result.insert({ nounBase, ending, NounEndingDivision::DC_SPECIAL_PLURAL_FORM }); + //result.insert({ nounBase, ending, NounEndingDivision::DC_SPECIAL_PLURAL_FORM }); } //Check missed vowel (simple case) @@ -470,7 +610,7 @@ std::wstring GetNounNoninative(std::wstring nounBase, NounDeclencion nounDeclenc return L""; } -std::wstring i_form_consolants = L"гкжшчщ"; +std::wstring i_form_consolants = L"гхкжшчщ"; std::wstring u_form_consolants = L"бпдтвфзснмлрц"; wchar_t GetLastChar(const NounRecord& nounRecord) @@ -494,14 +634,6 @@ bool charIsUFormConsolant(wchar_t c) } -bool 
AIFormEndingIsCorrect(const NounRecord& nounRecord) -{ - return (GetLastChar(nounRecord) == L'а' && charIsIFormConsolant(GetPrevLastChar(nounRecord))) && nounRecord.haveSingleForm || - (GetLastChar(nounRecord) == L'и' && charIsIFormConsolant(GetPrevLastChar(nounRecord))) && !nounRecord.haveSingleForm; -} - - - bool FirstAIFormInanimateSingularCondition(const NounRecord& nounRecord) { return nounRecord.haveSingleForm && nounRecord.canBeInanimate && (GetLastChar(nounRecord) == L'а' && charIsIFormConsolant(GetPrevLastChar(nounRecord))); @@ -670,6 +802,31 @@ bool SecondMaleSSFormAnimatePluralCondition(const NounRecord& nounRecord) ); } + +bool SecondIShortInanimateSingularCondition(const NounRecord& nounRecord) +{ + return nounRecord.haveSingleForm && nounRecord.canBeInanimate && GetLastChar(nounRecord) == L'й'; +} + +bool SecondIShortAnimateSingularCondition(const NounRecord& nounRecord) +{ + return nounRecord.haveSingleForm && nounRecord.canBeAnimate && GetLastChar(nounRecord) == L'й'; +} + +bool SecondIShortInanimatePluralCondition(const NounRecord& nounRecord) +{ + return nounRecord.haveSingleForm && nounRecord.canBeInanimate && GetLastChar(nounRecord) == L'й'; +} + +bool SecondIShortAnimatePluralCondition(const NounRecord& nounRecord) +{ + return nounRecord.haveSingleForm && nounRecord.canBeAnimate && GetLastChar(nounRecord) == L'й'; +} + + + + + bool SecondNeutralEFormSingularCondition(const NounRecord& nounRecord) { return nounRecord.haveSingleForm && nounRecord.gender == NG_NEUTRAL && GetLastChar(nounRecord) == L'е'; @@ -760,6 +917,12 @@ void SetupDeclentionMap() DeclentionConditionMap[{SECOND_MALE_SSFORM_ANIMATE, NC_SINGULAR}] = std::bind(SecondMaleSSFormAnimateSingularCondition, std::placeholders::_1); DeclentionConditionMap[{SECOND_MALE_SSFORM_ANIMATE, NC_PLURAL}] = std::bind(SecondMaleSSFormAnimatePluralCondition, std::placeholders::_1); + DeclentionConditionMap[{SECOND_I_SHORT_INANIMATE, NC_SINGULAR}] = std::bind(SecondIShortInanimateSingularCondition, 
std::placeholders::_1); + DeclentionConditionMap[{SECOND_I_SHORT_INANIMATE, NC_PLURAL}] = std::bind(SecondIShortInanimatePluralCondition, std::placeholders::_1); + DeclentionConditionMap[{SECOND_I_SHORT_ANIMATE, NC_SINGULAR}] = std::bind(SecondIShortAnimateSingularCondition, std::placeholders::_1); + DeclentionConditionMap[{SECOND_I_SHORT_ANIMATE, NC_PLURAL}] = std::bind(SecondIShortAnimatePluralCondition, std::placeholders::_1); + + DeclentionConditionMap[{SECOND_NEUTRAL_E_FORM, NC_SINGULAR}] = std::bind(SecondNeutralEFormSingularCondition, std::placeholders::_1); DeclentionConditionMap[{SECOND_NEUTRAL_E_FORM, NC_PLURAL}] = std::bind(SecondNeutralEFormPluralCondition, std::placeholders::_1); DeclentionConditionMap[{SECOND_NEUTRAL_O_FORM, NC_SINGULAR}] = std::bind(SecondNeutralOFormSingularCondition, std::placeholders::_1); @@ -814,6 +977,8 @@ bool IsDeclencionSecondType(NounDeclencion nounDeclention) case SECOND_MALE_UFORM_ANIMATE: case SECOND_MALE_SSFORM_INANIMATE: case SECOND_MALE_SSFORM_ANIMATE: + case SECOND_I_SHORT_INANIMATE: + case SECOND_I_SHORT_ANIMATE: return true; break; default: @@ -832,6 +997,7 @@ bool IsDeclencionAnimated(NounDeclencion nounDeclention) case SECOND_MALE_IFORM_ANIMATE: case SECOND_MALE_UFORM_ANIMATE: case SECOND_MALE_SSFORM_ANIMATE: + case SECOND_I_SHORT_ANIMATE: case THIRD_FORM_ANIMATE: return true; break; @@ -888,8 +1054,6 @@ void FillDivisionCaseMaps() DivisionCaseNounModificatorMap[NounEndingDivision::DC_SPECIAL_PLURAL_A] = DivisionCaseNounModificatorMap[NounEndingDivision::DC_COMMON]; - DivisionCaseNounModificatorMap[NounEndingDivision::DC_SPECIAL_PLURAL_FORM] = DivisionCaseNounModificatorMap[NounEndingDivision::DC_COMMON]; - DivisionCaseEndingModificatorMap[NounEndingDivision::DC_COMMON] = [](std::wstring s){ return s; }; DivisionCaseEndingModificatorMap[NounEndingDivision::DC_LOST_VOWEL_O] = DivisionCaseEndingModificatorMap[NounEndingDivision::DC_COMMON]; DivisionCaseEndingModificatorMap[NounEndingDivision::DC_LOST_VOWEL_E] = 
DivisionCaseEndingModificatorMap[NounEndingDivision::DC_COMMON]; @@ -901,8 +1065,6 @@ void FillDivisionCaseMaps() return L""; }; - DivisionCaseEndingModificatorMap[NounEndingDivision::DC_SPECIAL_PLURAL_FORM] = DivisionCaseEndingModificatorMap[NounEndingDivision::DC_COMMON]; - DivisionCaseNounTupleFilterMap[NounEndingDivision::DC_COMMON] = [](NounTuple t) { return true; }; @@ -923,10 +1085,6 @@ void FillDivisionCaseMaps() ((std::get<2>(t) == NGC_P4_ACCUSATIVE && !IsDeclencionAnimated(std::get<0>(t))))); }; - DivisionCaseNounTupleFilterMap[NounEndingDivision::DC_SPECIAL_PLURAL_FORM] = [](NounTuple t) - { - return (std::get<1>(t) == NC_PLURAL); - }; DivisionCaseNounTupleRecordFilterMap[NounEndingDivision::DC_COMMON] = [](NounTuple t, NounRecord r) { @@ -948,12 +1106,6 @@ void FillDivisionCaseMaps() return r.haveAlternativeMultipleFormEnding && StandardTest(t, r); }; - - DivisionCaseNounTupleRecordFilterMap[NounEndingDivision::DC_SPECIAL_PLURAL_FORM] = [](NounTuple t, NounRecord r) - { - return r.specialMultipleForm != L"" && StandardTest(t, r); - }; - } @@ -984,74 +1136,41 @@ std::vector RecognizeNoun(std::wstring noun) if (DivisionCaseNounTupleFilterMap[dc](nounTuple)) { - std::wstring nounNominaviteSingular = GetNounNoninative(modifiedNounBase, std::get<0>(nounTuple), NC_SINGULAR); - - if (NounIsInDictionary(nounNominaviteSingular)) + if (std::get<1>(nounTuple) == NC_SINGULAR) { - NounRecord nounRecord = GetNounRecordFromDictionary(nounNominaviteSingular); + std::wstring nounNominaviteSingular = GetNounNoninative(modifiedNounBase, std::get<0>(nounTuple), NC_SINGULAR); - if (DivisionCaseNounTupleRecordFilterMap[dc](nounTuple, nounRecord)) + if (NounIsInDictionary(nounNominaviteSingular)) { - result.push_back({ std::get<2>(nounTuple), std::get<1>(nounTuple), IsDeclencionAnimated(std::get<0>(nounTuple)), nounRecord }); + + NounRecord nounRecord = GetNounRecordFromDictionary(nounNominaviteSingular); + + if (DivisionCaseNounTupleRecordFilterMap[dc](nounTuple, 
nounRecord)) + { + result.push_back({ std::get<2>(nounTuple), std::get<1>(nounTuple), IsDeclencionAnimated(std::get<0>(nounTuple)), nounRecord }); + } } + } - - - std::wstring nounNominavitePlural = GetNounNoninative(nounBase, std::get<0>(nounTuple), NC_PLURAL); - - if (NounSpecialPluralFormIsInDictionary(nounNominavitePlural)) + else { - NounRecord nounRecord = GetNounRecordFromDictionary_BySpecialPluralForm(nounNominavitePlural); - if (DivisionCaseNounTupleRecordFilterMap[dc](nounTuple, nounRecord)) + std::wstring nounNominavitePlural = GetNounNoninative(nounBase, std::get<0>(nounTuple), NC_PLURAL); + + if (NounPluralFormIsInDictionary(nounNominavitePlural)) { - result.push_back({ std::get<2>(nounTuple), std::get<1>(nounTuple), IsDeclencionAnimated(std::get<0>(nounTuple)), nounRecord }); + NounRecord nounRecord = GetNounRecordFromDictionary_ByPluralForm(nounNominavitePlural); + + if (DivisionCaseNounTupleRecordFilterMap[dc](nounTuple, nounRecord)) + { + result.push_back({ std::get<2>(nounTuple), std::get<1>(nounTuple), IsDeclencionAnimated(std::get<0>(nounTuple)), nounRecord }); + } } } } } - - /* - //Special plural form check - for (auto nounTuple : possibleTupleArr) - { - if (std::get<1>(nounTuple) == NC_PLURAL) - { - std::wstring nounNominavitePlural = GetNounNoninative(nounBase, std::get<0>(nounTuple), NC_PLURAL); - - if (NounSpecialPluralFormIsInDictionary(nounNominavitePlural)) - { - NounRecord nounRecord = GetNounRecordFromDictionary_BySpecialPluralForm(nounNominavitePlural); - - - if (nounRecord.canBeAnimate) - { - NounStruct ns{ std::get<2>(nounTuple), std::get<1>(nounTuple), true, nounRecord }; - - if (!NounScructIsAlreadyInArray(ns, result)) - { - result.push_back(ns); - } - } - - if (nounRecord.canBeInanimate) - { - NounStruct ns{ std::get<2>(nounTuple), std::get<1>(nounTuple), false, nounRecord }; - - if (!NounScructIsAlreadyInArray(ns, result)) - { - result.push_back(ns); - } - } - } - } - - }*/ - - - } @@ -1078,6 +1197,15 @@ NounDeclencion 
CalculateNounDeclention(NounRecord nounRecord) } +void CalculatePluralForm() +{ + for (auto& nounRecord : NounRecordArr) + { + nounRecord.precalculatedNominativePluralSet = getPluralForm(nounRecord); + } +} + + void LoadNounDeclencionCaseTable() { diff --git a/rudict/rudict/noun.h b/rudict/rudict/noun.h index 8eecefa..a8ff107 100644 --- a/rudict/rudict/noun.h +++ b/rudict/rudict/noun.h @@ -35,6 +35,8 @@ struct NounRecord bool canBeAnimate; bool canBeInanimate; + std::set precalculatedNominativePluralSet; + NounRecord(); NounRecord(std::wstring line); }; @@ -53,6 +55,8 @@ enum NounDeclencion SECOND_MALE_UFORM_ANIMATE, SECOND_MALE_SSFORM_INANIMATE, SECOND_MALE_SSFORM_ANIMATE, + SECOND_I_SHORT_INANIMATE, + SECOND_I_SHORT_ANIMATE, SECOND_NEUTRAL_E_FORM, SECOND_NEUTRAL_O_FORM, THIRD_FORM_INANIMATE, @@ -126,8 +130,7 @@ struct NounEndingDivision DC_COMMON = 0, DC_LOST_VOWEL_O, DC_LOST_VOWEL_E, - DC_SPECIAL_PLURAL_A, - DC_SPECIAL_PLURAL_FORM + DC_SPECIAL_PLURAL_A } divisionCase; bool operator<(const NounEndingDivision& other) const @@ -152,18 +155,23 @@ struct NounEndingDivision std::vector GetAllNounEndingArr(); + +std::set getPluralForm(NounRecord nounRecord); + bool NounIsInDictionary(std::wstring nounNominative); -bool NounSpecialPluralFormIsInDictionary(std::wstring nounNominativePlural); +bool NounPluralFormIsInDictionary(std::wstring nounNominativePlural); NounRecord GetNounRecordFromDictionary(std::wstring nounNominative); -NounRecord GetNounRecordFromDictionary_BySpecialPluralForm(std::wstring nounNominativePlural); +NounRecord GetNounRecordFromDictionary_ByPluralForm(std::wstring nounNominativePlural); -bool charIsConsolant(wchar_t c); +bool charIsConsolant(wchar_t c); //except й bool charIsVowel(wchar_t c); +bool charIsMissingVowelSoftenerConsolant(wchar_t c); + struct NounStruct { NounGrammaticalCase nounGrammaticalCase; @@ -179,6 +187,12 @@ std::vector GetPossibleNounTupleArr(std::wstring nounEnding); std::wstring GetNounNoninative(std::wstring nounBase, 
NounDeclencion nounDeclencion, NounCount nounCount); +wchar_t GetLastChar(const NounRecord& nounRecord); +wchar_t GetPrevLastChar(const NounRecord& nounRecord); +bool charIsIFormConsolant(wchar_t c); +bool charIsUFormConsolant(wchar_t c); + + void SetupDeclentionMap(); bool NounFitsDeclention(NounRecord nounRecord, NounTuple nounTuple); @@ -192,8 +206,11 @@ std::vector RecognizeNoun(std::wstring noun); NounDeclencion CalculateNounDeclention(NounRecord nounRecord); +void CalculatePluralForm(); void LoadFrequentWordSet(); + + void LoadNounDeclencionCaseTable(); #endif //NOUN_H_INCLUDED