chinese-journal/rudict/rudict/noun.cpp

1229 lines
34 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "noun.h"
#include <iostream> //Xperimental -- for debug only
#include "utf8utf16.h"
#include "boost/regex.hpp"
#include "boost/algorithm/string/regex.hpp"
namespace NN
{
// Dictionary of known nouns; LoadFrequentWordSet() appends one record per CSV line.
std::vector<NounRecord> NounRecordArr;
// Ending tables, one record per declension; rebuilt by LoadNounDeclencionCaseTable().
// GetNounNoninative() indexes this vector directly with the NounDeclencion value.
std::vector<NounDeclencionCaseTableRecord> nounDeclencionCaseTable;
// Creates a blank record: gender is NG_MALE and every boolean flag is false.
NounRecord::NounRecord()
    : gender(NG_MALE),
      haveSingleForm(false),
      haveMultipleForm(false),
      haveStandardMultipleForm(false),
      haveStandardMultipleFormWithMissingLastVowel(false),
      haveStandardMultipleFormEnding(false),
      haveAlternativeMultipleFormEnding(false),
      canBeAnimate(false),
      canBeInanimate(false)
{
}
// Parses one ';'-separated dictionary line into a record.
// Field layout (index 0 is not used by this parser):
//   1 nominative form, 2 gender letter, 3..8 "1"-flags for the single/multiple form
//   properties, 9 special plural form, 10 can-be-animate flag, 11 can-be-inanimate flag.
// A line with fewer than 12 fields is reported on std::cout and leaves the record
// with the same values as the default constructor, instead of indexing past the
// end of the split result.
NounRecord::NounRecord(std::wstring line)
    : gender(NG_MALE)
    , haveSingleForm(false)
    , haveMultipleForm(false)
    , haveStandardMultipleForm(false)
    , haveStandardMultipleFormWithMissingLastVowel(false)
    , haveStandardMultipleFormEnding(false)
    , haveAlternativeMultipleFormEnding(false)
    , canBeAnimate(false)
    , canBeInanimate(false)
{
    std::vector<std::wstring> lineArr;
    boost::split_regex(lineArr, line, boost::wregex(L";"));
    if (lineArr.size() < 12)
    {
        std::cout << "Error in NounRecord: malformed dictionary line" << std::endl;
        return;
    }
    nominativeForm = lineArr[1];
    if (lineArr[2] == L"м")
    {
        gender = NG_MALE;
    }
    else if (lineArr[2] == L"ж")
    {
        gender = NG_FEMALE;
    }
    else
    {
        // Any other gender marker is treated as neutral.
        gender = NG_NEUTRAL;
    }
    haveSingleForm = (lineArr[3] == L"1");
    haveMultipleForm = (lineArr[4] == L"1");
    haveStandardMultipleForm = (lineArr[5] == L"1");
    haveStandardMultipleFormWithMissingLastVowel = (lineArr[6] == L"1");
    haveStandardMultipleFormEnding = (lineArr[7] == L"1");
    haveAlternativeMultipleFormEnding = (lineArr[8] == L"1");
    specialMultipleForm = lineArr[9];
    canBeAnimate = (lineArr[10] == L"1");
    canBeInanimate = (lineArr[11] == L"1");
}
// Maps the textual name of a declension to its NounDeclencion value.
// An unknown name is reported on std::cout and FIRST_A_IFORM_INANIMATE is returned.
NounDeclencion WStringToNounDeclencion(std::wstring str)
{
    struct NamedDeclencion
    {
        const wchar_t* name;
        NounDeclencion value;
    };
    static const NamedDeclencion knownDeclencions[] =
    {
        { L"FIRST_A_IFORM_INANIMATE", FIRST_A_IFORM_INANIMATE },
        { L"FIRST_A_IFORM_ANIMATE", FIRST_A_IFORM_ANIMATE },
        { L"FIRST_A_UFORM_INANIMATE", FIRST_A_UFORM_INANIMATE },
        { L"FIRST_A_UFORM_ANIMATE", FIRST_A_UFORM_ANIMATE },
        { L"FIRST_YA_FORM_INANIMATE", FIRST_YA_FORM_INANIMATE },
        { L"FIRST_YA_FORM_ANIMATE", FIRST_YA_FORM_ANIMATE },
        { L"SECOND_MALE_IFORM_INANIMATE", SECOND_MALE_IFORM_INANIMATE },
        { L"SECOND_MALE_IFORM_ANIMATE", SECOND_MALE_IFORM_ANIMATE },
        { L"SECOND_MALE_UFORM_INANIMATE", SECOND_MALE_UFORM_INANIMATE },
        { L"SECOND_MALE_UFORM_ANIMATE", SECOND_MALE_UFORM_ANIMATE },
        { L"SECOND_MALE_SSFORM_INANIMATE", SECOND_MALE_SSFORM_INANIMATE },
        { L"SECOND_MALE_SSFORM_ANIMATE", SECOND_MALE_SSFORM_ANIMATE },
        { L"SECOND_I_SHORT_INANIMATE", SECOND_I_SHORT_INANIMATE },
        { L"SECOND_I_SHORT_ANIMATE", SECOND_I_SHORT_ANIMATE },
        { L"SECOND_NEUTRAL_E_FORM", SECOND_NEUTRAL_E_FORM },
        { L"SECOND_NEUTRAL_O_FORM", SECOND_NEUTRAL_O_FORM },
        { L"THIRD_FORM_INANIMATE", THIRD_FORM_INANIMATE },
        { L"THIRD_FORM_ANIMATE", THIRD_FORM_ANIMATE },
    };
    for (const NamedDeclencion& entry : knownDeclencions)
    {
        if (str == entry.name)
        {
            return entry.value;
        }
    }
    std::cout << "Error in WStringToNounDeclencion!" << std::endl;
    return FIRST_A_IFORM_INANIMATE;
}
// Returns every case ending the recognizer tries to strip from a word, including
// the empty ending. Each ending appears exactly once (the list previously contained
// "я" twice, which only caused the same split to be evaluated twice).
std::vector<std::wstring> GetAllNounEndingArr()
{
    std::vector<std::wstring> result
    {
        L"",
        L"й",
        L"ев",
        L"а",
        L"и",
        L"е",
        L"у",
        L"ой",
        L"ы",
        L"ом",
        L"ь",
        L"я",
        L"ю",
        L"ем",
        L"о",
        L"ью",
        L"ам",
        L"ами",
        L"ах",
        L"ов",
        L"ей",
        L"ям",
        L"ях",
        L"ями",
    };
    return result;
}
// Returns true when some record in NounRecordArr has exactly this nominative form.
bool NounIsInDictionary(std::wstring nounNominative)
{
    auto it = NounRecordArr.begin();
    while (it != NounRecordArr.end() && !(it->nominativeForm == nounNominative))
    {
        ++it;
    }
    return it != NounRecordArr.end();
}
// Builds the standard nominative plural from a nominative singular by rewriting or
// appending the final letter:
//   -а -> -и / -ы (chosen by the preceding consonant class), -я -> -и,
//   consonant -> +и / +ы, -ь -> -и, -й -> -и, -о -> -а, -е -> -я.
// An input that matches none of these rules (including the empty string) is
// reported on std::cout and returned unchanged.
std::wstring convertToStandardPluralForm(std::wstring s)
{
    std::wstring pluralForm = s;
    if (pluralForm.empty())
    {
        // Nothing to inspect; avoid indexing at size() - 1.
        std::cout << "Error in convertToStandardPluralForm" << std::endl;
        return pluralForm;
    }
    const wchar_t last = pluralForm.back();
    if (last == L'а')
    {
        // A one-letter word has no preceding consonant; L'\0' stands in for it.
        const wchar_t beforeLast = pluralForm.size() >= 2 ? pluralForm[pluralForm.size() - 2] : L'\0';
        if (charIsIFormConsolant(beforeLast))
        {
            pluralForm.back() = L'и';
        }
        else if (charIsUFormConsolant(beforeLast))
        {
            pluralForm.back() = L'ы';
        }
        else
        {
            std::cout << "Error in convertToStandardPluralForm" << std::endl;
        }
    }
    else if (last == L'я')
    {
        pluralForm.back() = L'и';
    }
    else if (charIsIFormConsolant(last))
    {
        pluralForm += L'и';
    }
    else if (charIsUFormConsolant(last))
    {
        pluralForm += L'ы';
    }
    else if (last == L'ь')
    {
        pluralForm.back() = L'и';
    }
    else if (last == L'й')
    {
        pluralForm.back() = L'и';
    }
    else if (last == L'о')
    {
        pluralForm.back() = L'а';
    }
    else if (last == L'е')
    {
        pluralForm.back() = L'я';
    }
    else
    {
        std::cout << "Error in convertToStandardPluralForm" << std::endl;
    }
    return pluralForm;
}
// Turns a standard plural ending into the alternative one: a final 'и' becomes 'я'
// and a final 'ы' becomes 'а'. Any other input, including the empty string, is
// returned unchanged.
std::wstring convertFromStandardToAlternativePluralForm(std::wstring s)
{
    if (s.empty())
    {
        // No last character to inspect.
        return s;
    }
    wchar_t& last = s.back();
    if (last == L'и')
    {
        last = L'я';
    }
    else if (last == L'ы')
    {
        last = L'а';
    }
    return s;
}
// Collects every nominative plural spelling derivable for a dictionary record:
//   * the explicit special plural form, when present;
//   * for nouns without a singular form, the nominative form itself;
//   * otherwise the standard plural and/or the plural with the last stem vowel
//     dropped, each with the standard and/or alternative ending as flagged.
std::set<std::wstring> getPluralForm(NounRecord noun)
{
    std::set<std::wstring> result;
    if (noun.specialMultipleForm != L"")
    {
        result.insert(noun.specialMultipleForm);
    }
    if (!noun.haveSingleForm)
    {
        // Plural-only noun: the dictionary form already is the plural.
        result.insert(noun.nominativeForm);
        return result;
    }
    if (!noun.haveStandardMultipleForm && !noun.haveStandardMultipleFormWithMissingLastVowel)
    {
        return result;
    }

    // Adds a plural stem with the ending variants enabled for this noun.
    const auto addWithEndings = [&result, &noun](const std::wstring& form)
    {
        if (noun.haveStandardMultipleFormEnding)
        {
            result.insert(form);
        }
        if (noun.haveAlternativeMultipleFormEnding)
        {
            result.insert(convertFromStandardToAlternativePluralForm(form));
        }
    };

    const std::wstring standardPlural = convertToStandardPluralForm(noun.nominativeForm);
    if (noun.haveStandardMultipleForm)
    {
        addWithEndings(standardPlural);
    }
    if (noun.haveStandardMultipleFormWithMissingLastVowel)
    {
        // The vowel to drop sits third from the end and the letter before it decides
        // between replacing it with a soft sign and removing it, so at least four
        // characters are required.
        if (standardPlural.size() < 4)
        {
            std::cout << "Error in getPluralForm" << std::endl;
        }
        else
        {
            std::wstring shortened = standardPlural;
            const std::size_t vowelPos = shortened.size() - 3;
            if (charIsMissingVowelSoftenerConsolant(shortened[vowelPos - 1]))
            {
                shortened[vowelPos] = L'ь';
            }
            else
            {
                shortened.erase(vowelPos, 1);
            }
            addWithEndings(shortened);
        }
    }
    return result;
}
// Returns true when a record that has a plural form lists this spelling in its
// precalculated nominative plural set.
bool NounPluralFormIsInDictionary(std::wstring nounNominativePlural)
{
    for (auto it = NounRecordArr.begin(); it != NounRecordArr.end(); ++it)
    {
        if (!it->haveMultipleForm)
        {
            continue;
        }
        if (it->precalculatedNominativePluralSet.count(nounNominativePlural) != 0)
        {
            return true;
        }
    }
    return false;
}
// Returns a copy of the first record whose nominative form equals the argument,
// or a default-constructed record when there is none.
NounRecord GetNounRecordFromDictionary(std::wstring nounNominative)
{
    auto it = NounRecordArr.begin();
    while (it != NounRecordArr.end())
    {
        if (it->nominativeForm == nounNominative)
        {
            return *it;
        }
        ++it;
    }
    return NounRecord();
}
// Returns a copy of the first record that has a plural form and lists the given
// spelling in its precalculated plural set, or a default-constructed record.
NounRecord GetNounRecordFromDictionary_ByPluralForm(std::wstring nounNominativePlural)
{
    for (auto it = NounRecordArr.begin(); it != NounRecordArr.end(); ++it)
    {
        const bool matches = it->haveMultipleForm &&
            it->precalculatedNominativePluralSet.count(nounNominativePlural) != 0;
        if (matches)
        {
            return *it;
        }
    }
    return NounRecord();
}
// Tells whether a dropped stem vowel after this consonant is replaced by a soft sign.
//   лев   -> львы  (after л the е turns into ь)
//   немец -> немцы (after any other consonant the е simply disappears)
bool charIsMissingVowelSoftenerConsolant(wchar_t c)
{
    const std::wstring softeningConsonants = L"л";
    return softeningConsonants.find(c) != std::wstring::npos;
}
std::set<NounEndingDivision> getPossibleNounEndingDivisionSet(std::wstring noun)
{
std::set<NounEndingDivision> result;
auto allNounEndingArr = GetAllNounEndingArr();
for (auto ending : allNounEndingArr)
{
if (boost::ends_with(noun, ending))
{
std::wstring nounBase = boost::replace_last_copy(noun, ending, "");
if ((charIsVowel(nounBase[nounBase.size() - 1])) || //Might be exact the й case
(charIsConsolant(nounBase[nounBase.size() - 1]) || nounBase[nounBase.size() - 1] == L'ь' || nounBase[nounBase.size() - 1] == L'ъ'))
{
result.insert({ nounBase, ending, NounEndingDivision::DC_COMMON });
}
//Check missed vowel (simple case)
if (charIsConsolant(nounBase[nounBase.size() - 1]) && charIsConsolant(nounBase[nounBase.size() - 2]))
{
result.insert({ nounBase, ending, NounEndingDivision::DC_LOST_VOWEL_O });
}
if (charIsConsolant(nounBase[nounBase.size() - 1]) && nounBase[nounBase.size() - 2] == L'ь' && charIsMissingVowelSoftenerConsolant(nounBase[nounBase.size() - 3]))
{
result.insert({ nounBase, ending, NounEndingDivision::DC_LOST_VOWEL_E });
}
if (charIsConsolant(nounBase[nounBase.size() - 1]) && charIsConsolant(nounBase[nounBase.size() - 2]) && !charIsMissingVowelSoftenerConsolant(nounBase[nounBase.size() - 2]))
{
result.insert({ nounBase, ending, NounEndingDivision::DC_LOST_VOWEL_E });
}
}
}
return result;
}
// Lists every (declension, count, grammatical case) combination whose ending set
// contains nounEnding. At most NGC_SIZE * NC_SIZE entries of each declension's case
// table are examined, and never more than the table actually holds, so a table
// loaded from an incomplete file is not read past its end.
std::vector<NounTuple> GetPossibleNounTupleArr(std::wstring nounEnding)
{
    std::vector<NounTuple> result;
    const std::size_t expectedEntries = static_cast<std::size_t>(NGC_SIZE * NC_SIZE);
    for (const auto& noun : nounDeclencionCaseTable)
    {
        const auto& caseTable = noun.grammaticalCaseTable;
        const std::size_t limit = caseTable.size() < expectedEntries ? caseTable.size() : expectedEntries;
        for (std::size_t i = 0; i < limit; ++i)
        {
            if (caseTable[i].ending.count(nounEnding) != 0)
            {
                result.push_back(NounTuple{ noun.nounDeclencion, caseTable[i].count, caseTable[i].grammaticalCase });
            }
        }
    }
    return result;
}
std::vector<NounTuple> FilterNounTupleArrByNounDeclentionSet(std::vector<NounTuple> nounTupleArr, std::set<NounDeclencion> filter)
{
std::vector<NounTuple> result;
for (auto nounTuple : nounTupleArr)
{
if (filter.count(std::get<0>(nounTuple)) != 0)
{
result.push_back(nounTuple);
}
}
return result;
}
std::set<std::wstring> GetNounNoninative(std::wstring nounBase, NounDeclencion nounDeclencion, NounCount nounCount)
{
std::set<std::wstring> result;
NounDeclencionCaseTableRecord nounDeclencionCaseTableRecord = nounDeclencionCaseTable[static_cast<int>(nounDeclencion)];
for (auto& grammaticalTableRecord : nounDeclencionCaseTableRecord.grammaticalCaseTable)
{
if (grammaticalTableRecord.grammaticalCase == NGC_P1_NOMINATIVE && grammaticalTableRecord.count == nounCount)
{
for (auto& e : grammaticalTableRecord.ending)
{
result.insert(nounBase + e);
}
}
}
return result;
}
// Builds the alternative nominative plural of a base: "я" is appended when the base
// ends in a consonant accepted by charIsIFormConsolant or in a vowel, "а" when it
// ends in a consonant accepted by charIsUFormConsolant.
// nounDeclencion is not used by this function.
// An empty base or an unclassified last character is reported on std::cout and
// yields an empty string.
std::wstring GetNounNoninativeSpecialPluralA(std::wstring nounBase, NounDeclencion nounDeclencion)
{
    if (!nounBase.empty())
    {
        const wchar_t last = nounBase.back();
        if (charIsIFormConsolant(last))
        {
            return nounBase + L"я";
        }
        if (charIsUFormConsolant(last))
        {
            return nounBase + L"а";
        }
        if (charIsVowel(last))
        {
            return nounBase + L"я";
        }
    }
    std::cout << "Error in GetNounNoninativeSpecialPluralA" << std::endl;
    return L"";
}
// Returns the last character of the record's nominative form, or L'\0' when the
// form is empty (e.g. a default-constructed record), so callers comparing against
// concrete letters simply get "no match" instead of an out-of-range read.
wchar_t GetLastChar(const NounRecord& nounRecord)
{
    const std::wstring& form = nounRecord.nominativeForm;
    return form.empty() ? L'\0' : form[form.size() - 1];
}
// Returns the second-to-last character of the record's nominative form, or L'\0'
// when the form has fewer than two characters, instead of reading out of range.
wchar_t GetPrevLastChar(const NounRecord& nounRecord)
{
    const std::wstring& form = nounRecord.nominativeForm;
    return form.size() < 2 ? L'\0' : form[form.size() - 2];
}
// FIRST_A_IFORM_INANIMATE, singular: the noun has a singular form, can be inanimate,
// and its nominative ends in 'а' after a consonant accepted by charIsIFormConsolant.
bool FirstAIFormInanimateSingularCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveSingleForm || !nounRecord.canBeInanimate)
    {
        return false;
    }
    return GetLastChar(nounRecord) == L'а' && charIsIFormConsolant(GetPrevLastChar(nounRecord));
}
// FIRST_A_IFORM_ANIMATE, singular: same shape test as above for nouns that can be animate.
bool FirstAIFormAnimateSingularCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveSingleForm || !nounRecord.canBeAnimate)
    {
        return false;
    }
    return GetLastChar(nounRecord) == L'а' && charIsIFormConsolant(GetPrevLastChar(nounRecord));
}
// FIRST_A_IFORM_INANIMATE, plural: the noun has a plural form, can be inanimate, and
// either ends in 'а', or has no singular form and ends in 'и'; in both cases the
// preceding letter must be accepted by charIsIFormConsolant.
bool FirstAIFormInanimatePluralCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveMultipleForm || !nounRecord.canBeInanimate)
    {
        return false;
    }
    const wchar_t last = GetLastChar(nounRecord);
    const bool endingFits = (last == L'а') || (!nounRecord.haveSingleForm && last == L'и');
    return endingFits && charIsIFormConsolant(GetPrevLastChar(nounRecord));
}
// FIRST_A_IFORM_ANIMATE, plural: same shape test as above for nouns that can be animate.
bool FirstAIFormAnimatePluralCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveMultipleForm || !nounRecord.canBeAnimate)
    {
        return false;
    }
    const wchar_t last = GetLastChar(nounRecord);
    const bool endingFits = (last == L'а') || (!nounRecord.haveSingleForm && last == L'и');
    return endingFits && charIsIFormConsolant(GetPrevLastChar(nounRecord));
}
// FIRST_A_UFORM_INANIMATE, singular: the noun has a singular form, can be inanimate,
// and its nominative ends in 'а' after a consonant accepted by charIsUFormConsolant.
bool FirstAUFormInanimateSingularCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveSingleForm || !nounRecord.canBeInanimate)
    {
        return false;
    }
    return GetLastChar(nounRecord) == L'а' && charIsUFormConsolant(GetPrevLastChar(nounRecord));
}
// FIRST_A_UFORM_ANIMATE, singular: same shape test as above for nouns that can be animate.
bool FirstAUFormAnimateSingularCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveSingleForm || !nounRecord.canBeAnimate)
    {
        return false;
    }
    return GetLastChar(nounRecord) == L'а' && charIsUFormConsolant(GetPrevLastChar(nounRecord));
}
// FIRST_A_UFORM_INANIMATE, plural: the noun has a plural form, can be inanimate, and
// either ends in 'а', or has no singular form and ends in 'ы'; in both cases the
// preceding letter must be accepted by charIsUFormConsolant.
bool FirstAUFormInanimatePluralCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveMultipleForm || !nounRecord.canBeInanimate)
    {
        return false;
    }
    const wchar_t last = GetLastChar(nounRecord);
    const bool endingFits = (last == L'а') || (!nounRecord.haveSingleForm && last == L'ы');
    return endingFits && charIsUFormConsolant(GetPrevLastChar(nounRecord));
}
// FIRST_A_UFORM_ANIMATE, plural: same shape test as above for nouns that can be animate.
bool FirstAUFormAnimatePluralCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveMultipleForm || !nounRecord.canBeAnimate)
    {
        return false;
    }
    const wchar_t last = GetLastChar(nounRecord);
    const bool endingFits = (last == L'а') || (!nounRecord.haveSingleForm && last == L'ы');
    return endingFits && charIsUFormConsolant(GetPrevLastChar(nounRecord));
}
// FIRST_YA_FORM_INANIMATE, singular: has a singular form, can be inanimate, ends in 'я'.
bool FirstYaFormInanimateSingularCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveSingleForm || !nounRecord.canBeInanimate)
    {
        return false;
    }
    return GetLastChar(nounRecord) == L'я';
}
// FIRST_YA_FORM_ANIMATE, singular: has a singular form, can be animate, ends in 'я'.
bool FirstYaFormAnimateSingularCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveSingleForm || !nounRecord.canBeAnimate)
    {
        return false;
    }
    return GetLastChar(nounRecord) == L'я';
}
// FIRST_YA_FORM_INANIMATE, plural: has a plural form, can be inanimate, and either
// ends in 'я' or has no singular form and ends in 'и'.
bool FirstYaFormInanimatePluralCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveMultipleForm || !nounRecord.canBeInanimate)
    {
        return false;
    }
    const wchar_t last = GetLastChar(nounRecord);
    if (last == L'я')
    {
        return true;
    }
    return !nounRecord.haveSingleForm && last == L'и';
}
// FIRST_YA_FORM_ANIMATE, plural: same shape test as above for nouns that can be animate.
bool FirstYaFormAnimatePluralCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveMultipleForm || !nounRecord.canBeAnimate)
    {
        return false;
    }
    const wchar_t last = GetLastChar(nounRecord);
    if (last == L'я')
    {
        return true;
    }
    return !nounRecord.haveSingleForm && last == L'и';
}
// Second declension
// SECOND_MALE_IFORM_INANIMATE, singular: male noun with a singular form that can be
// inanimate and ends in a consonant accepted by charIsIFormConsolant.
bool SecondMaleIFormInanimateSingularCondition(const NounRecord& nounRecord)
{
    const bool eligible = nounRecord.haveSingleForm && nounRecord.gender == NG_MALE && nounRecord.canBeInanimate;
    return eligible && charIsIFormConsolant(GetLastChar(nounRecord));
}
// SECOND_MALE_IFORM_INANIMATE, plural: male noun with a plural form that can be
// inanimate and either ends in such a consonant, or has no singular form and ends
// in such a consonant followed by 'и'.
bool SecondMaleIFormInanimatePluralCondition(const NounRecord& nounRecord)
{
    const bool eligible = nounRecord.haveMultipleForm && nounRecord.gender == NG_MALE && nounRecord.canBeInanimate;
    if (!eligible)
    {
        return false;
    }
    if (charIsIFormConsolant(GetLastChar(nounRecord)))
    {
        return true;
    }
    return !nounRecord.haveSingleForm
        && charIsIFormConsolant(GetPrevLastChar(nounRecord))
        && GetLastChar(nounRecord) == L'и';
}
// SECOND_MALE_IFORM_ANIMATE, singular: as the inanimate singular test, for nouns that can be animate.
bool SecondMaleIFormAnimateSingularCondition(const NounRecord& nounRecord)
{
    const bool eligible = nounRecord.haveSingleForm && nounRecord.gender == NG_MALE && nounRecord.canBeAnimate;
    return eligible && charIsIFormConsolant(GetLastChar(nounRecord));
}
// SECOND_MALE_IFORM_ANIMATE, plural: as the inanimate plural test, for nouns that can be animate.
bool SecondMaleIFormAnimatePluralCondition(const NounRecord& nounRecord)
{
    const bool eligible = nounRecord.haveMultipleForm && nounRecord.gender == NG_MALE && nounRecord.canBeAnimate;
    if (!eligible)
    {
        return false;
    }
    if (charIsIFormConsolant(GetLastChar(nounRecord)))
    {
        return true;
    }
    return !nounRecord.haveSingleForm
        && charIsIFormConsolant(GetPrevLastChar(nounRecord))
        && GetLastChar(nounRecord) == L'и';
}
// SECOND_MALE_UFORM_INANIMATE, singular: male noun with a singular form that can be
// inanimate and ends in a consonant accepted by charIsUFormConsolant.
bool SecondMaleUFormInanimateSingularCondition(const NounRecord& nounRecord)
{
    const bool eligible = nounRecord.haveSingleForm && nounRecord.gender == NG_MALE && nounRecord.canBeInanimate;
    return eligible && charIsUFormConsolant(GetLastChar(nounRecord));
}
// SECOND_MALE_UFORM_INANIMATE, plural: male noun with a plural form that can be
// inanimate and either ends in such a consonant, or has no singular form and ends
// in such a consonant followed by 'ы'.
bool SecondMaleUFormInanimatePluralCondition(const NounRecord& nounRecord)
{
    const bool eligible = nounRecord.haveMultipleForm && nounRecord.gender == NG_MALE && nounRecord.canBeInanimate;
    if (!eligible)
    {
        return false;
    }
    if (charIsUFormConsolant(GetLastChar(nounRecord)))
    {
        return true;
    }
    return !nounRecord.haveSingleForm
        && charIsUFormConsolant(GetPrevLastChar(nounRecord))
        && GetLastChar(nounRecord) == L'ы';
}
// SECOND_MALE_UFORM_ANIMATE, singular: as the inanimate singular test, for nouns that can be animate.
bool SecondMaleUFormAnimateSingularCondition(const NounRecord& nounRecord)
{
    const bool eligible = nounRecord.haveSingleForm && nounRecord.gender == NG_MALE && nounRecord.canBeAnimate;
    return eligible && charIsUFormConsolant(GetLastChar(nounRecord));
}
// SECOND_MALE_UFORM_ANIMATE, plural: as the inanimate plural test, for nouns that can be animate.
bool SecondMaleUFormAnimatePluralCondition(const NounRecord& nounRecord)
{
    const bool eligible = nounRecord.haveMultipleForm && nounRecord.gender == NG_MALE && nounRecord.canBeAnimate;
    if (!eligible)
    {
        return false;
    }
    if (charIsUFormConsolant(GetLastChar(nounRecord)))
    {
        return true;
    }
    return !nounRecord.haveSingleForm
        && charIsUFormConsolant(GetPrevLastChar(nounRecord))
        && GetLastChar(nounRecord) == L'ы';
}
// SECOND_MALE_SSFORM_INANIMATE, singular: male noun with a singular form that can be
// inanimate and ends in the soft sign 'ь'.
bool SecondMaleSSFormInanimateSingularCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveSingleForm || nounRecord.gender != NG_MALE || !nounRecord.canBeInanimate)
    {
        return false;
    }
    return GetLastChar(nounRecord) == L'ь';
}
// SECOND_MALE_SSFORM_INANIMATE, plural: male noun with a plural form that can be
// inanimate and either ends in 'ь' or has no singular form and ends in 'и'.
bool SecondMaleSSFormInanimatePluralCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveMultipleForm || nounRecord.gender != NG_MALE || !nounRecord.canBeInanimate)
    {
        return false;
    }
    const wchar_t last = GetLastChar(nounRecord);
    return last == L'ь' || (!nounRecord.haveSingleForm && last == L'и');
}
// SECOND_MALE_SSFORM_ANIMATE, singular: as the inanimate singular test, for nouns that can be animate.
bool SecondMaleSSFormAnimateSingularCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveSingleForm || nounRecord.gender != NG_MALE || !nounRecord.canBeAnimate)
    {
        return false;
    }
    return GetLastChar(nounRecord) == L'ь';
}
// SECOND_MALE_SSFORM_ANIMATE, plural: as the inanimate plural test, for nouns that can be animate.
bool SecondMaleSSFormAnimatePluralCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveMultipleForm || nounRecord.gender != NG_MALE || !nounRecord.canBeAnimate)
    {
        return false;
    }
    const wchar_t last = GetLastChar(nounRecord);
    return last == L'ь' || (!nounRecord.haveSingleForm && last == L'и');
}
bool SecondIShortInanimateSingularCondition(const NounRecord& nounRecord)
{
return nounRecord.haveSingleForm && nounRecord.canBeInanimate && GetLastChar(nounRecord) == L'й';
}
bool SecondIShortAnimateSingularCondition(const NounRecord& nounRecord)
{
return nounRecord.haveSingleForm && nounRecord.canBeAnimate && GetLastChar(nounRecord) == L'й';
}
bool SecondIShortInanimatePluralCondition(const NounRecord& nounRecord)
{
return nounRecord.haveSingleForm && nounRecord.canBeInanimate && GetLastChar(nounRecord) == L'й';
}
bool SecondIShortAnimatePluralCondition(const NounRecord& nounRecord)
{
return nounRecord.haveSingleForm && nounRecord.canBeAnimate && GetLastChar(nounRecord) == L'й';
}
// SECOND_NEUTRAL_E_FORM, singular: neutral noun with a singular form ending in 'е'.
bool SecondNeutralEFormSingularCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveSingleForm || nounRecord.gender != NG_NEUTRAL)
    {
        return false;
    }
    return GetLastChar(nounRecord) == L'е';
}
// SECOND_NEUTRAL_E_FORM, plural: neutral noun with a plural form that either ends
// in 'е' or has no singular form and ends in 'я'.
bool SecondNeutralEFormPluralCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveMultipleForm || nounRecord.gender != NG_NEUTRAL)
    {
        return false;
    }
    const wchar_t last = GetLastChar(nounRecord);
    return last == L'е' || (!nounRecord.haveSingleForm && last == L'я');
}
// SECOND_NEUTRAL_O_FORM, singular: neutral noun with a singular form ending in 'о'.
bool SecondNeutralOFormSingularCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveSingleForm || nounRecord.gender != NG_NEUTRAL)
    {
        return false;
    }
    return GetLastChar(nounRecord) == L'о';
}
// SECOND_NEUTRAL_O_FORM, plural: neutral noun with a plural form that either ends
// in 'о' or has no singular form and ends in 'а'.
bool SecondNeutralOFormPluralCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveMultipleForm || nounRecord.gender != NG_NEUTRAL)
    {
        return false;
    }
    const wchar_t last = GetLastChar(nounRecord);
    return last == L'о' || (!nounRecord.haveSingleForm && last == L'а');
}
// THIRD_FORM_INANIMATE, singular: female noun with a singular form that can be
// inanimate and ends in the soft sign 'ь'.
bool ThirdFormInanimateSingularCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveSingleForm || nounRecord.gender != NG_FEMALE || !nounRecord.canBeInanimate)
    {
        return false;
    }
    return GetLastChar(nounRecord) == L'ь';
}
// THIRD_FORM_ANIMATE, singular: as above, for nouns that can be animate.
bool ThirdFormAnimateSingularCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveSingleForm || nounRecord.gender != NG_FEMALE || !nounRecord.canBeAnimate)
    {
        return false;
    }
    return GetLastChar(nounRecord) == L'ь';
}
// THIRD_FORM_INANIMATE, plural: female noun with a plural form that can be inanimate
// and either ends in 'ь' or has no singular form and ends in 'и'.
bool ThirdFormInanimatePluralCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveMultipleForm || nounRecord.gender != NG_FEMALE || !nounRecord.canBeInanimate)
    {
        return false;
    }
    const wchar_t last = GetLastChar(nounRecord);
    return last == L'ь' || (!nounRecord.haveSingleForm && last == L'и');
}
// THIRD_FORM_ANIMATE, plural: as above, for nouns that can be animate.
bool ThirdFormAnimatePluralCondition(const NounRecord& nounRecord)
{
    if (!nounRecord.haveMultipleForm || nounRecord.gender != NG_FEMALE || !nounRecord.canBeAnimate)
    {
        return false;
    }
    const wchar_t last = GetLastChar(nounRecord);
    return last == L'ь' || (!nounRecord.haveSingleForm && last == L'и');
}
std::map<std::pair<NounDeclencion, NounCount>, std::function < bool(const NounRecord&) >> DeclentionConditionMap;
void SetupDeclentionMap()
{
DeclentionConditionMap[{FIRST_A_IFORM_INANIMATE, NC_SINGULAR}] = std::bind(FirstAIFormInanimateSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{FIRST_A_IFORM_ANIMATE, NC_SINGULAR}] = std::bind(FirstAIFormAnimateSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{FIRST_A_IFORM_INANIMATE, NC_PLURAL}] = std::bind(FirstAIFormInanimatePluralCondition, std::placeholders::_1);
DeclentionConditionMap[{FIRST_A_IFORM_ANIMATE, NC_PLURAL}] = std::bind(FirstAIFormAnimatePluralCondition, std::placeholders::_1);
DeclentionConditionMap[{FIRST_A_UFORM_INANIMATE, NC_SINGULAR}] = std::bind(FirstAUFormInanimateSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{FIRST_A_UFORM_ANIMATE, NC_SINGULAR}] = std::bind(FirstAUFormAnimateSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{FIRST_A_UFORM_INANIMATE, NC_PLURAL}] = std::bind(FirstAUFormInanimatePluralCondition, std::placeholders::_1);
DeclentionConditionMap[{FIRST_A_UFORM_ANIMATE, NC_PLURAL}] = std::bind(FirstAUFormAnimatePluralCondition, std::placeholders::_1);
DeclentionConditionMap[{FIRST_YA_FORM_INANIMATE, NC_SINGULAR}] = std::bind(FirstYaFormInanimateSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{FIRST_YA_FORM_ANIMATE, NC_SINGULAR}] = std::bind(FirstYaFormAnimateSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{FIRST_YA_FORM_INANIMATE, NC_PLURAL}] = std::bind(FirstYaFormInanimatePluralCondition, std::placeholders::_1);
DeclentionConditionMap[{FIRST_YA_FORM_ANIMATE, NC_PLURAL}] = std::bind(FirstYaFormAnimatePluralCondition, std::placeholders::_1);
//Second form
DeclentionConditionMap[{SECOND_MALE_IFORM_INANIMATE, NC_SINGULAR}] = std::bind(SecondMaleIFormInanimateSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_MALE_IFORM_INANIMATE, NC_PLURAL}] = std::bind(SecondMaleIFormInanimatePluralCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_MALE_IFORM_ANIMATE, NC_SINGULAR}] = std::bind(SecondMaleIFormAnimateSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_MALE_IFORM_ANIMATE, NC_PLURAL}] = std::bind(SecondMaleIFormAnimatePluralCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_MALE_UFORM_INANIMATE, NC_SINGULAR}] = std::bind(SecondMaleUFormInanimateSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_MALE_UFORM_INANIMATE, NC_PLURAL}] = std::bind(SecondMaleUFormInanimatePluralCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_MALE_UFORM_ANIMATE, NC_SINGULAR}] = std::bind(SecondMaleUFormAnimateSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_MALE_UFORM_ANIMATE, NC_PLURAL}] = std::bind(SecondMaleUFormAnimatePluralCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_MALE_SSFORM_INANIMATE, NC_SINGULAR}] = std::bind(SecondMaleSSFormInanimateSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_MALE_SSFORM_INANIMATE, NC_PLURAL}] = std::bind(SecondMaleSSFormInanimatePluralCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_MALE_SSFORM_ANIMATE, NC_SINGULAR}] = std::bind(SecondMaleSSFormAnimateSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_MALE_SSFORM_ANIMATE, NC_PLURAL}] = std::bind(SecondMaleSSFormAnimatePluralCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_I_SHORT_INANIMATE, NC_SINGULAR}] = std::bind(SecondIShortInanimateSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_I_SHORT_INANIMATE, NC_PLURAL}] = std::bind(SecondIShortInanimatePluralCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_I_SHORT_ANIMATE, NC_SINGULAR}] = std::bind(SecondIShortAnimateSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_I_SHORT_ANIMATE, NC_PLURAL}] = std::bind(SecondIShortAnimatePluralCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_NEUTRAL_E_FORM, NC_SINGULAR}] = std::bind(SecondNeutralEFormSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_NEUTRAL_E_FORM, NC_PLURAL}] = std::bind(SecondNeutralEFormPluralCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_NEUTRAL_O_FORM, NC_SINGULAR}] = std::bind(SecondNeutralOFormSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{SECOND_NEUTRAL_O_FORM, NC_PLURAL}] = std::bind(SecondNeutralOFormPluralCondition, std::placeholders::_1);
DeclentionConditionMap[{THIRD_FORM_INANIMATE, NC_SINGULAR}] = std::bind(ThirdFormInanimateSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{THIRD_FORM_ANIMATE, NC_SINGULAR}] = std::bind(ThirdFormAnimateSingularCondition, std::placeholders::_1);
DeclentionConditionMap[{THIRD_FORM_INANIMATE, NC_PLURAL}] = std::bind(ThirdFormInanimatePluralCondition, std::placeholders::_1);
DeclentionConditionMap[{THIRD_FORM_ANIMATE, NC_PLURAL}] = std::bind(ThirdFormAnimatePluralCondition, std::placeholders::_1);
}
// Checks whether a dictionary record satisfies the condition registered for the
// tuple's (declension, count) pair in DeclentionConditionMap.
// The map is searched without being modified. A pair with no registered condition
// (for example when SetupDeclentionMap() has not run) is reported on std::cout and
// treated as "does not fit"; previously operator[] inserted an empty std::function
// and invoking it threw std::bad_function_call.
bool NounFitsDeclention(NounRecord nounRecord, NounTuple nounTuple)
{
    const NounDeclencion nounDeclencion = std::get<0>(nounTuple);
    const NounCount nounCount = std::get<1>(nounTuple);
    const auto condition = DeclentionConditionMap.find({ nounDeclencion, nounCount });
    if (condition == DeclentionConditionMap.end() || !condition->second)
    {
        std::cout << "Error in NounFitsDeclention" << std::endl;
        return false;
    }
    return condition->second(nounRecord);
}
bool NounScructIsAlreadyInArray(const NounStruct& nounStruct, const std::vector<NounStruct>& arr)
{
for (auto& ns : arr)
{
if (ns.nounGrammaticalCase == nounStruct.nounGrammaticalCase &&
ns.nounRecord.nominativeForm == nounStruct.nounRecord.nominativeForm)
{
return true;
}
}
return false;
}
// Returns true for the SECOND_MALE_* and SECOND_I_SHORT_* declensions.
// NOTE(review): SECOND_NEUTRAL_E_FORM / SECOND_NEUTRAL_O_FORM are not included here;
// confirm with callers whether that is intended.
bool IsDeclencionSecondType(NounDeclencion nounDeclention)
{
    return nounDeclention == SECOND_MALE_IFORM_INANIMATE
        || nounDeclention == SECOND_MALE_IFORM_ANIMATE
        || nounDeclention == SECOND_MALE_UFORM_INANIMATE
        || nounDeclention == SECOND_MALE_UFORM_ANIMATE
        || nounDeclention == SECOND_MALE_SSFORM_INANIMATE
        || nounDeclention == SECOND_MALE_SSFORM_ANIMATE
        || nounDeclention == SECOND_I_SHORT_INANIMATE
        || nounDeclention == SECOND_I_SHORT_ANIMATE;
}
// Returns true for every declension whose name ends in _ANIMATE.
bool IsDeclencionAnimated(NounDeclencion nounDeclention)
{
    static const NounDeclencion animateDeclencions[] =
    {
        FIRST_A_IFORM_ANIMATE,
        FIRST_A_UFORM_ANIMATE,
        FIRST_YA_FORM_ANIMATE,
        SECOND_MALE_IFORM_ANIMATE,
        SECOND_MALE_UFORM_ANIMATE,
        SECOND_MALE_SSFORM_ANIMATE,
        SECOND_I_SHORT_ANIMATE,
        THIRD_FORM_ANIMATE,
    };
    for (const NounDeclencion candidate : animateDeclencions)
    {
        if (candidate == nounDeclention)
        {
            return true;
        }
    }
    return false;
}
// Record filter for regular divisions: the record must fit the tuple's declension and count.
bool StandardTest(NounTuple nounTuple, NounRecord nounRecord)
{
    const bool fits = NounFitsDeclention(nounRecord, nounTuple);
    return fits;
}
// Record filter for the lost-'о' division: only the record's missing-last-vowel
// flag is consulted; the tuple is ignored.
bool LostVowelOTest(NounTuple /*nounTuple*/, NounRecord nounRecord)
{
    const bool dropsVowel = nounRecord.haveStandardMultipleFormWithMissingLastVowel;
    return dropsVowel;
}
// Record filter for the lost-'е' division: same check as LostVowelOTest.
bool LostVowelETest(NounTuple /*nounTuple*/, NounRecord nounRecord)
{
    const bool dropsVowel = nounRecord.haveStandardMultipleFormWithMissingLastVowel;
    return dropsVowel;
}
// Per-division-case strategies used by RecognizeNoun():
//   NounModificator   - rebuilds the dictionary-style base from the base found in the word;
//   EndingModificator - adjusts the ending before it is looked up in the case tables;
//   TupleFilter       - rejects (declension, count, case) tuples impossible for the case;
//   TupleRecordFilter - final check of a tuple against a concrete dictionary record.
std::map<NounEndingDivision::DivisionCase, std::function<std::wstring(std::wstring)>> DivisionCaseNounModificatorMap;
std::map<NounEndingDivision::DivisionCase, std::function<std::wstring(std::wstring)>> DivisionCaseEndingModificatorMap;
std::map < NounEndingDivision::DivisionCase, std::function < bool(NounTuple) >> DivisionCaseNounTupleFilterMap;
std::map < NounEndingDivision::DivisionCase, std::function < bool(NounTuple, NounRecord) >> DivisionCaseNounTupleRecordFilterMap;
// Fills the four strategy maps above for DC_COMMON, DC_LOST_VOWEL_O and DC_LOST_VOWEL_E.
// Must run before RecognizeNoun() is used.
void FillDivisionCaseMaps()
{
    DivisionCaseNounModificatorMap[NounEndingDivision::DC_COMMON] = [](std::wstring s){ return s; };
    DivisionCaseNounModificatorMap[NounEndingDivision::DC_LOST_VOWEL_O] = [](std::wstring s) -> std::wstring
    {
        if (s.size() < 2)
        {
            // Too short to have lost a vowel between two consonants.
            return s;
        }
        // Re-insert the dropped vowel between the last two consonants: "угл" -> "угол".
        // The vowel is written as \u043E (Cyrillic 'о') so it cannot be confused with
        // the look-alike Latin letter, which would never match a dictionary word.
        return std::wstring(s.begin(), s.end() - 1) + L"\u043E" + s[s.size() - 1];
    };
    DivisionCaseNounModificatorMap[NounEndingDivision::DC_LOST_VOWEL_E] = [](std::wstring s) -> std::wstring
    {
        if (s.size() < 2)
        {
            return s;
        }
        if (s[s.size() - 2] == L'ь')
        {
            // The soft sign took the vowel's place: "льв" -> "лев".
            return std::wstring(s.begin(), s.end() - 2) + L"е" + s[s.size() - 1];
        }
        // The vowel simply vanished: "немц" -> "немец".
        return std::wstring(s.begin(), s.end() - 1) + L"е" + s[s.size() - 1];
    };
    // Endings are never altered, whatever the division case.
    DivisionCaseEndingModificatorMap[NounEndingDivision::DC_COMMON] = [](std::wstring s){ return s; };
    DivisionCaseEndingModificatorMap[NounEndingDivision::DC_LOST_VOWEL_O] = DivisionCaseEndingModificatorMap[NounEndingDivision::DC_COMMON];
    DivisionCaseEndingModificatorMap[NounEndingDivision::DC_LOST_VOWEL_E] = DivisionCaseEndingModificatorMap[NounEndingDivision::DC_COMMON];
    DivisionCaseNounTupleFilterMap[NounEndingDivision::DC_COMMON] = [](NounTuple t) { return true; };
    // Lost-vowel tuples: any plural is accepted; a singular is accepted unless it is
    // the nominative, or the accusative of a declension that is not animate.
    DivisionCaseNounTupleFilterMap[NounEndingDivision::DC_LOST_VOWEL_O] = [](NounTuple t)
    {
        return (std::get<1>(t) == NC_PLURAL) ||
            ((std::get<2>(t) != NGC_P1_NOMINATIVE) &&
            (!(std::get<2>(t) == NGC_P4_ACCUSATIVE && !IsDeclencionAnimated(std::get<0>(t)))));
    };
    DivisionCaseNounTupleFilterMap[NounEndingDivision::DC_LOST_VOWEL_E] = DivisionCaseNounTupleFilterMap[NounEndingDivision::DC_LOST_VOWEL_O];
    DivisionCaseNounTupleRecordFilterMap[NounEndingDivision::DC_COMMON] = [](NounTuple t, NounRecord r)
    {
        return (r.haveStandardMultipleForm || std::get<1>(t) == NC_SINGULAR) && StandardTest(t, r);
    };
    DivisionCaseNounTupleRecordFilterMap[NounEndingDivision::DC_LOST_VOWEL_O] = [](NounTuple t, NounRecord r)
    {
        return LostVowelOTest(t, r) && StandardTest(t, r);
    };
    DivisionCaseNounTupleRecordFilterMap[NounEndingDivision::DC_LOST_VOWEL_E] = [](NounTuple t, NounRecord r)
    {
        return LostVowelETest(t, r) && StandardTest(t, r);
    };
}
std::set<NounStruct> RecognizeNoun(std::wstring noun)
{
std::set<NounStruct> result;
auto nounEndingDivisionArr = getPossibleNounEndingDivisionSet(noun);
for (auto nounEndingDivision : nounEndingDivisionArr)
{
std::wstring nounBase = nounEndingDivision.base;
std::wstring nounEnding = nounEndingDivision.ending;
NounEndingDivision::DivisionCase dc = nounEndingDivision.divisionCase;
std::wstring modifiedNounBase = DivisionCaseNounModificatorMap[dc](nounBase);
std::wstring modifiedNounEnding = DivisionCaseEndingModificatorMap[dc](nounEnding);
std::vector<NounTuple> possibleTupleArr = GetPossibleNounTupleArr(modifiedNounEnding);
//Standard check
for (NounTuple nounTuple : possibleTupleArr)
{
if (DivisionCaseNounTupleFilterMap[dc](nounTuple))
{
if (std::get<1>(nounTuple) == NC_SINGULAR)
{
std::set<std::wstring> nounNominaviteSingularSet = GetNounNoninative(modifiedNounBase, std::get<0>(nounTuple), NC_SINGULAR);
for (auto& nn : nounNominaviteSingularSet)
{
if (NounIsInDictionary(nn))
{
NounRecord nounRecord = GetNounRecordFromDictionary(nn);
if (DivisionCaseNounTupleRecordFilterMap[dc](nounTuple, nounRecord))
{
result.insert({ std::get<2>(nounTuple), std::get<1>(nounTuple), IsDeclencionAnimated(std::get<0>(nounTuple)), nounRecord });
}
}
}
}
else
{
std::set<std::wstring> nounNominavitePluralSet = GetNounNoninative(nounBase, std::get<0>(nounTuple), NC_PLURAL);
//Check all plural forms
for (auto& nn : nounNominavitePluralSet)
{
if (NounPluralFormIsInDictionary(nn))
{
NounRecord nounRecord = GetNounRecordFromDictionary_ByPluralForm(nn);
if (DivisionCaseNounTupleRecordFilterMap[dc](nounTuple, nounRecord))
{
result.insert({ std::get<2>(nounTuple), std::get<1>(nounTuple), IsDeclencionAnimated(std::get<0>(nounTuple)), nounRecord });
}
}
}
}
}
}
}
return result;
}
// Returns the declension of the first entry of DeclentionConditionMap (in key order)
// whose condition accepts the record. When none does, an error is printed and a
// value-initialized NounDeclencion is returned.
NounDeclencion CalculateNounDeclention(NounRecord nounRecord)
{
    //Xperimental -- need to find if here might be more than 1 declention
    for (auto it = DeclentionConditionMap.begin(); it != DeclentionConditionMap.end(); ++it)
    {
        const bool accepted = it->second(nounRecord);
        if (accepted)
        {
            return it->first.first;
        }
    }
    std::cout << "Error in CalculateNounDeclention" << std::endl;
    return NounDeclencion();
}
void CalculatePluralForm()
{
for (auto& nounRecord : NounRecordArr)
{
nounRecord.precalculatedNominativePluralSet = getPluralForm(nounRecord);
}
}
// Rebuilds nounDeclencionCaseTable from grammar_case.csv (';'-separated, first line
// is skipped as a header). Columns used:
//   0 declension name (non-empty only on the first row of each declension),
//   1 noun count (non-empty only when it changes), 2 grammatical case,
//   3 endings separated by ", ".
// Rows are grouped into one table record per declension. A trailing '\r' left by
// CRLF line ends is stripped so it cannot become part of the last ending, and rows
// with fewer than four columns are reported and skipped instead of being indexed
// out of range.
void LoadNounDeclencionCaseTable()
{
    nounDeclencionCaseTable.clear();
#ifdef _WIN32
    std::ifstream f("C:/Workplace/ChineseJournal/rudict/grammar_case.csv");
#else
    std::ifstream f("/home/devuser/workplace/rudict/grammar_case.csv");
#endif
    if (!f.is_open())
    {
        std::cout << "file not found!" << std::endl;
        return;
    }
    std::cout << "File found!" << std::endl;
    std::vector<GrammaticalTableRecord> currentGrammaticalCaseTable;
    std::wstring currentNounDeclencion;
    std::wstring currentNounCount;
    std::string line;
    getline(f, line); //Skip one line
    while (getline(f, line))
    {
        if (!line.empty() && line.back() == '\r')
        {
            line.pop_back();
        }
        std::vector<std::string> lineArr;
        boost::split_regex(lineArr, line, boost::regex(";"));
        if (lineArr.size() < 4)
        {
            std::cout << "Error in LoadNounDeclencionCaseTable: malformed line" << std::endl;
            continue;
        }
        if (lineArr[0] != "")
        {
            // A new declension starts: flush the one collected so far, if any.
            if (currentNounDeclencion != L"")
            {
                nounDeclencionCaseTable.push_back(NounDeclencionCaseTableRecord{ WStringToNounDeclencion(currentNounDeclencion), currentGrammaticalCaseTable });
                currentGrammaticalCaseTable.clear();
            }
            currentNounDeclencion = string_to_wstring(lineArr[0]);
        }
        if (lineArr[1] != "")
        {
            currentNounCount = string_to_wstring(lineArr[1]);
        }
        std::wstring endings = string_to_wstring(lineArr[3]);
        std::set<std::wstring> endingsSet;
        boost::split_regex(endingsSet, endings, boost::regex(", "));
        currentGrammaticalCaseTable.push_back({
            WStringToNounCount(currentNounCount),
            WStringToNounGrammaticalCase(string_to_wstring(lineArr[2])),
            endingsSet
        });
    }
    //Add last one
    if (currentNounDeclencion != L"")
    {
        nounDeclencionCaseTable.push_back(NounDeclencionCaseTableRecord{ WStringToNounDeclencion(currentNounDeclencion), currentGrammaticalCaseTable });
    }
    // The stream closes itself when it goes out of scope.
}
// Appends one NounRecord to NounRecordArr for every data line of
// frequent_nouns_2000.csv (the first line is skipped as a header).
// A trailing '\r' left by CRLF line ends is stripped so the last flag column still
// compares equal to "1", and blank lines are skipped rather than handed to the
// record parser.
void LoadFrequentWordSet()
{
#ifdef _WIN32
    std::ifstream f("C:/Workplace/ChineseJournal/rudict/frequent_nouns_2000.csv");
#else
    std::ifstream f("/home/devuser/workplace/rudict/frequent_nouns_2000.csv");
#endif
    if (!f.is_open())
    {
        std::cout << "file not found!" << std::endl;
        return;
    }
    std::string line;
    getline(f, line); //Skip one line
    std::cout << "File found!" << std::endl;
    while (getline(f, line))
    {
        if (!line.empty() && line.back() == '\r')
        {
            line.pop_back();
        }
        if (line.empty())
        {
            continue;
        }
        NounRecord nounRecord(string_to_wstring(line));
        NounRecordArr.push_back(nounRecord);
    }
    // The stream closes itself when it goes out of scope.
}
} //namespace NN