// chinese-journal/rudict/rudict/noun.cpp
// 2014-11-27 09:04:08 +00:00
//
// 445 lines
// 14 KiB
// C++
// Raw Blame History
//
// This file contains ambiguous Unicode characters
//
// This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#include "noun.h"
#include <iostream> //Xperimental -- for debug only
#include "utf8utf16.h"
// Returns a human-readable English label for a noun declension.
//
// nounDeclencion: the declension to describe.
// Returns the label, or an empty string when the value matches none of the
// enumerators handled below.
std::wstring NounDeclencionToWString(NounDeclencion nounDeclencion)
{
    switch (nounDeclencion)
    {
    case ND_1_HARD: return L"First declencion (hard type), female";
    case ND_1_SOFT: return L"First declencion (soft type), female";
    case ND_2_HARD_MALE: return L"Second declencion (hard type), male";
    case ND_2_SOFT_MALE: return L"Second declencion (soft type), male";
    // The O/E labels follow the enumerator names: ND_2_NEUTER_O is the
    // O-ending declension and ND_2_NEUTER_E the E-ending one.
    case ND_2_NEUTER_O: return L"Second declencion, O-ending, neuter";
    case ND_2_NEUTER_E: return L"Second declencion, E-ending, neuter";
    case ND_3: return L"Third declencion, female";
    }
    return L"";
}
// Maps a grammatical case to its English display name.
//
// nounGrammaticalCase: the case to describe.
// Returns the display name; the result stays empty for any value that is
// not one of the six cases listed below.
std::wstring NounGrammaticalCaseToWString(NounGrammaticalCase nounGrammaticalCase)
{
    std::wstring label;
    switch (nounGrammaticalCase)
    {
    case NGC_P1_NOMINATIVE:
        label = L"Nominative case";
        break;
    case NGC_P2_GENITIVE:
        label = L"Genitive case";
        break;
    case NGC_P3_DATIVE:
        label = L"Dative case";
        break;
    case NGC_P4_ACCUSATIVE:
        label = L"Accusative case";
        break;
    case NGC_P5_INSTRUMENTAL:
        label = L"Instrumental case";
        break;
    case NGC_P6_PREPOSITIONAL:
        label = L"Prepositional case";
        break;
    }
    return label;
}
// Maps a grammatical number to its English display name.
//
// nounNumber: singular or plural marker.
// Returns the display name, or an empty string for any other value.
std::wstring NounNumberToWString(NounNumber nounNumber)
{
    if (nounNumber == NPF_SINGULAR)
    {
        return L"Singular form";
    }
    if (nounNumber == NPF_PLURAL)
    {
        return L"Plural form";
    }
    return L"";
}
// Word list used as the dictionary: NounIsInDictionary() looks words up here.
// Within this file it is only filled by LoadFrequentWordSet().
std::set<std::wstring> frequentWordSet;
// Lists every candidate noun ending, including the empty one.
//
// Each ending appears exactly once, so a word is never split the same way
// twice. The list covers every ending used in getNounEndingTable(),
// including the alternative instrumental forms "ою" and "ею".
//
// Returns the endings in a fixed order, starting with the empty ending.
std::vector<std::wstring> GetAllNounEndingArr()
{
    std::vector<std::wstring> result
    {
        L"",
        L"а",
        L"и",
        L"е",
        L"у",
        L"ой",
        L"ы",
        L"ом",
        L"ь",
        L"я",
        L"ю",
        L"ем",
        L"о",
        L"ью",
        L"ам",
        L"ами",
        L"ах",
        L"ов",
        L"ей",
        L"ям",
        L"ях",
        L"ями",
        L"ою",
        L"ею",
    };
    return result;
}
// Builds the table of endings accepted for every (declension, case, number)
// combination.
//
// Each key is a NounTuple{ declension, case, number }; the mapped StringSet
// holds the ending(s) valid for that form (more than one where alternative
// forms exist). The table is rebuilt from scratch on every call.
//
// Returns the freshly built table.
std::map<NounTuple, StringSet> getNounEndingTable()
{
    std::map<NounTuple, StringSet> result;

    // ---- Singular ----
    result[NounTuple{ ND_1_SOFT, NGC_P1_NOMINATIVE, NPF_SINGULAR }] = StringSet{ L"а" };
    result[NounTuple{ ND_1_SOFT, NGC_P2_GENITIVE, NPF_SINGULAR }] = StringSet{ L"и" };
    result[NounTuple{ ND_1_SOFT, NGC_P3_DATIVE, NPF_SINGULAR }] = StringSet{ L"е" };
    result[NounTuple{ ND_1_SOFT, NGC_P4_ACCUSATIVE, NPF_SINGULAR }] = StringSet{ L"у" };
    result[NounTuple{ ND_1_SOFT, NGC_P5_INSTRUMENTAL, NPF_SINGULAR }] = StringSet{ L"ой", L"ою" };
    result[NounTuple{ ND_1_SOFT, NGC_P6_PREPOSITIONAL, NPF_SINGULAR }] = StringSet{ L"е" };

    result[NounTuple{ ND_1_HARD, NGC_P1_NOMINATIVE, NPF_SINGULAR }] = StringSet{ L"а" };
    result[NounTuple{ ND_1_HARD, NGC_P2_GENITIVE, NPF_SINGULAR }] = StringSet{ L"ы" };
    result[NounTuple{ ND_1_HARD, NGC_P3_DATIVE, NPF_SINGULAR }] = StringSet{ L"е" };
    result[NounTuple{ ND_1_HARD, NGC_P4_ACCUSATIVE, NPF_SINGULAR }] = StringSet{ L"у" };
    result[NounTuple{ ND_1_HARD, NGC_P5_INSTRUMENTAL, NPF_SINGULAR }] = StringSet{ L"ой", L"ою", L"ей", L"ею" };
    result[NounTuple{ ND_1_HARD, NGC_P6_PREPOSITIONAL, NPF_SINGULAR }] = StringSet{ L"е" };

    result[NounTuple{ ND_2_HARD_MALE, NGC_P1_NOMINATIVE, NPF_SINGULAR }] = StringSet{ L"" };
    result[NounTuple{ ND_2_HARD_MALE, NGC_P2_GENITIVE, NPF_SINGULAR }] = StringSet{ L"а" };
    result[NounTuple{ ND_2_HARD_MALE, NGC_P3_DATIVE, NPF_SINGULAR }] = StringSet{ L"у" };
    result[NounTuple{ ND_2_HARD_MALE, NGC_P4_ACCUSATIVE, NPF_SINGULAR }] = StringSet{ L"" };
    result[NounTuple{ ND_2_HARD_MALE, NGC_P5_INSTRUMENTAL, NPF_SINGULAR }] = StringSet{ L"ом" };
    result[NounTuple{ ND_2_HARD_MALE, NGC_P6_PREPOSITIONAL, NPF_SINGULAR }] = StringSet{ L"е" };

    result[NounTuple{ ND_2_SOFT_MALE, NGC_P1_NOMINATIVE, NPF_SINGULAR }] = StringSet{ L"ь" };
    result[NounTuple{ ND_2_SOFT_MALE, NGC_P2_GENITIVE, NPF_SINGULAR }] = StringSet{ L"я" };
    result[NounTuple{ ND_2_SOFT_MALE, NGC_P3_DATIVE, NPF_SINGULAR }] = StringSet{ L"ю" };
    result[NounTuple{ ND_2_SOFT_MALE, NGC_P4_ACCUSATIVE, NPF_SINGULAR }] = StringSet{ L"ь" };
    result[NounTuple{ ND_2_SOFT_MALE, NGC_P5_INSTRUMENTAL, NPF_SINGULAR }] = StringSet{ L"ем" };
    result[NounTuple{ ND_2_SOFT_MALE, NGC_P6_PREPOSITIONAL, NPF_SINGULAR }] = StringSet{ L"е" };

    result[NounTuple{ ND_2_NEUTER_O, NGC_P1_NOMINATIVE, NPF_SINGULAR }] = StringSet{ L"о" };
    result[NounTuple{ ND_2_NEUTER_O, NGC_P2_GENITIVE, NPF_SINGULAR }] = StringSet{ L"а" };
    result[NounTuple{ ND_2_NEUTER_O, NGC_P3_DATIVE, NPF_SINGULAR }] = StringSet{ L"у" };
    result[NounTuple{ ND_2_NEUTER_O, NGC_P4_ACCUSATIVE, NPF_SINGULAR }] = StringSet{ L"о" };
    result[NounTuple{ ND_2_NEUTER_O, NGC_P5_INSTRUMENTAL, NPF_SINGULAR }] = StringSet{ L"ом" };
    result[NounTuple{ ND_2_NEUTER_O, NGC_P6_PREPOSITIONAL, NPF_SINGULAR }] = StringSet{ L"е" };

    result[NounTuple{ ND_2_NEUTER_E, NGC_P1_NOMINATIVE, NPF_SINGULAR }] = StringSet{ L"е" };
    result[NounTuple{ ND_2_NEUTER_E, NGC_P2_GENITIVE, NPF_SINGULAR }] = StringSet{ L"я" };
    result[NounTuple{ ND_2_NEUTER_E, NGC_P3_DATIVE, NPF_SINGULAR }] = StringSet{ L"ю" };
    result[NounTuple{ ND_2_NEUTER_E, NGC_P4_ACCUSATIVE, NPF_SINGULAR }] = StringSet{ L"е" };
    result[NounTuple{ ND_2_NEUTER_E, NGC_P5_INSTRUMENTAL, NPF_SINGULAR }] = StringSet{ L"ем" };
    result[NounTuple{ ND_2_NEUTER_E, NGC_P6_PREPOSITIONAL, NPF_SINGULAR }] = StringSet{ L"е" };

    result[NounTuple{ ND_3, NGC_P1_NOMINATIVE, NPF_SINGULAR }] = StringSet{ L"ь" };
    result[NounTuple{ ND_3, NGC_P2_GENITIVE, NPF_SINGULAR }] = StringSet{ L"и" };
    result[NounTuple{ ND_3, NGC_P3_DATIVE, NPF_SINGULAR }] = StringSet{ L"и" };
    result[NounTuple{ ND_3, NGC_P4_ACCUSATIVE, NPF_SINGULAR }] = StringSet{ L"ь" };
    result[NounTuple{ ND_3, NGC_P5_INSTRUMENTAL, NPF_SINGULAR }] = StringSet{ L"ью" };
    result[NounTuple{ ND_3, NGC_P6_PREPOSITIONAL, NPF_SINGULAR }] = StringSet{ L"и" };

    // ---- Plural ----
    result[NounTuple{ ND_1_SOFT, NGC_P1_NOMINATIVE, NPF_PLURAL }] = StringSet{ L"и" };
    // TODO (experimental): the genitive plural needs a special modifier for the suffix.
    result[NounTuple{ ND_1_SOFT, NGC_P2_GENITIVE, NPF_PLURAL }] = StringSet{ L"" };
    result[NounTuple{ ND_1_SOFT, NGC_P3_DATIVE, NPF_PLURAL }] = StringSet{ L"ам" };
    result[NounTuple{ ND_1_SOFT, NGC_P4_ACCUSATIVE, NPF_PLURAL }] = StringSet{ L"и" };
    result[NounTuple{ ND_1_SOFT, NGC_P5_INSTRUMENTAL, NPF_PLURAL }] = StringSet{ L"ами" };
    result[NounTuple{ ND_1_SOFT, NGC_P6_PREPOSITIONAL, NPF_PLURAL }] = StringSet{ L"ах" };

    // The nominative plural uses the same ending as this declension's
    // genitive singular ("ы").
    result[NounTuple{ ND_1_HARD, NGC_P1_NOMINATIVE, NPF_PLURAL }] = StringSet{ L"ы" };
    result[NounTuple{ ND_1_HARD, NGC_P2_GENITIVE, NPF_PLURAL }] = StringSet{ L"" };
    result[NounTuple{ ND_1_HARD, NGC_P3_DATIVE, NPF_PLURAL }] = StringSet{ L"ам" };
    // NOTE(review): every other plural accusative row repeats its nominative
    // ending; this one uses the empty ending instead. Confirm whether L"ы"
    // should be accepted here as well.
    result[NounTuple{ ND_1_HARD, NGC_P4_ACCUSATIVE, NPF_PLURAL }] = StringSet{ L"" };
    result[NounTuple{ ND_1_HARD, NGC_P5_INSTRUMENTAL, NPF_PLURAL }] = StringSet{ L"ами" };
    result[NounTuple{ ND_1_HARD, NGC_P6_PREPOSITIONAL, NPF_PLURAL }] = StringSet{ L"ах" };

    result[NounTuple{ ND_2_HARD_MALE, NGC_P1_NOMINATIVE, NPF_PLURAL }] = StringSet{ L"ы" };
    result[NounTuple{ ND_2_HARD_MALE, NGC_P2_GENITIVE, NPF_PLURAL }] = StringSet{ L"ов" };
    result[NounTuple{ ND_2_HARD_MALE, NGC_P3_DATIVE, NPF_PLURAL }] = StringSet{ L"ам" };
    result[NounTuple{ ND_2_HARD_MALE, NGC_P4_ACCUSATIVE, NPF_PLURAL }] = StringSet{ L"ы" };
    result[NounTuple{ ND_2_HARD_MALE, NGC_P5_INSTRUMENTAL, NPF_PLURAL }] = StringSet{ L"ами" };
    result[NounTuple{ ND_2_HARD_MALE, NGC_P6_PREPOSITIONAL, NPF_PLURAL }] = StringSet{ L"ах" };

    result[NounTuple{ ND_2_SOFT_MALE, NGC_P1_NOMINATIVE, NPF_PLURAL }] = StringSet{ L"и" };
    result[NounTuple{ ND_2_SOFT_MALE, NGC_P2_GENITIVE, NPF_PLURAL }] = StringSet{ L"ей" };
    result[NounTuple{ ND_2_SOFT_MALE, NGC_P3_DATIVE, NPF_PLURAL }] = StringSet{ L"ям" };
    result[NounTuple{ ND_2_SOFT_MALE, NGC_P4_ACCUSATIVE, NPF_PLURAL }] = StringSet{ L"и" };
    // Instrumental plural is "ями", in line with the "ям" / "ях" rows around it.
    result[NounTuple{ ND_2_SOFT_MALE, NGC_P5_INSTRUMENTAL, NPF_PLURAL }] = StringSet{ L"ями" };
    result[NounTuple{ ND_2_SOFT_MALE, NGC_P6_PREPOSITIONAL, NPF_PLURAL }] = StringSet{ L"ях" };

    result[NounTuple{ ND_2_NEUTER_O, NGC_P1_NOMINATIVE, NPF_PLURAL }] = StringSet{ L"а" };
    result[NounTuple{ ND_2_NEUTER_O, NGC_P2_GENITIVE, NPF_PLURAL }] = StringSet{ L"" };
    result[NounTuple{ ND_2_NEUTER_O, NGC_P3_DATIVE, NPF_PLURAL }] = StringSet{ L"ам" };
    result[NounTuple{ ND_2_NEUTER_O, NGC_P4_ACCUSATIVE, NPF_PLURAL }] = StringSet{ L"а" };
    result[NounTuple{ ND_2_NEUTER_O, NGC_P5_INSTRUMENTAL, NPF_PLURAL }] = StringSet{ L"ами" };
    result[NounTuple{ ND_2_NEUTER_O, NGC_P6_PREPOSITIONAL, NPF_PLURAL }] = StringSet{ L"ах" };

    result[NounTuple{ ND_2_NEUTER_E, NGC_P1_NOMINATIVE, NPF_PLURAL }] = StringSet{ L"я" };
    result[NounTuple{ ND_2_NEUTER_E, NGC_P2_GENITIVE, NPF_PLURAL }] = StringSet{ L"ей" };
    result[NounTuple{ ND_2_NEUTER_E, NGC_P3_DATIVE, NPF_PLURAL }] = StringSet{ L"ям" };
    result[NounTuple{ ND_2_NEUTER_E, NGC_P4_ACCUSATIVE, NPF_PLURAL }] = StringSet{ L"я" };
    result[NounTuple{ ND_2_NEUTER_E, NGC_P5_INSTRUMENTAL, NPF_PLURAL }] = StringSet{ L"ями" };
    result[NounTuple{ ND_2_NEUTER_E, NGC_P6_PREPOSITIONAL, NPF_PLURAL }] = StringSet{ L"ях" };

    result[NounTuple{ ND_3, NGC_P1_NOMINATIVE, NPF_PLURAL }] = StringSet{ L"и" };
    result[NounTuple{ ND_3, NGC_P2_GENITIVE, NPF_PLURAL }] = StringSet{ L"ей" };
    result[NounTuple{ ND_3, NGC_P3_DATIVE, NPF_PLURAL }] = StringSet{ L"ям" };
    result[NounTuple{ ND_3, NGC_P4_ACCUSATIVE, NPF_PLURAL }] = StringSet{ L"и" };
    result[NounTuple{ ND_3, NGC_P5_INSTRUMENTAL, NPF_PLURAL }] = StringSet{ L"ями" };
    result[NounTuple{ ND_3, NGC_P6_PREPOSITIONAL, NPF_PLURAL }] = StringSet{ L"ях" };

    return result;
}
// Tells whether a noun is present in frequentWordSet.
//
// This is a pure lookup: it prints nothing and makes no assumption about the
// word's length or about the set being non-empty, so it is safe to call with
// any word and before the word list has been loaded.
//
// nounNominative: the word to look up (callers in this file pass a
//                 reconstructed nominative form).
// Returns true when the exact word is stored in frequentWordSet.
bool NounIsInDictionary(std::wstring nounNominative)
{
    return frequentWordSet.count(nounNominative) != 0;
}
std::set<NounDeclencion> GetPossibleNounDeclencionSet(std::wstring nounNominative)
{
if (nounNominative.size() <= 1)
{
//Xperimental -- need to say that word is too short!
return{};
}
wchar_t lastChar = nounNominative[nounNominative.size()-1];
wchar_t prevLastChar = nounNominative[nounNominative.size() - 2];
if (lastChar == L'а')
{
return{ ND_1_HARD, ND_1_SOFT };
}
if (lastChar == L'о')
{
return{ ND_2_NEUTER_O };
}
if (lastChar == L'е')
{
return{ ND_2_NEUTER_E };
}
if (lastChar == L'ь')
{
return{ ND_2_SOFT_MALE };
}
return{ ND_2_HARD_MALE };
}
// Checks whether a character is one of the lowercase Cyrillic consonant
// letters listed below.
//
// c: the character to test.
// Returns true only for a character contained in the list; uppercase
// letters and any other characters yield false.
bool charIsConsolant(wchar_t c)
{
    const std::wstring consonantLetters = L"йцкнгшщзхфвпрлджчсмтб";
    return consonantLetters.find(c) != std::wstring::npos;
}
// Checks whether a character is one of the lowercase Cyrillic vowel letters
// listed below.
//
// c: the character to test.
// Returns true only for a character contained in the list; uppercase
// letters and any other characters yield false.
bool charIsVowel(wchar_t c)
{
    const std::wstring vowelLetters = L"аоуыэяёюие";
    return vowelLetters.find(c) != std::wstring::npos;
}
// Enumerates the ways a word can be split into a stem and a known ending.
//
// Every ending from GetAllNounEndingArr() that the word ends with is tried.
// A split is kept only when the remaining stem is non-empty and its last
// character is a consonant according to charIsConsolant().
//
// noun: the word form to split.
// Returns (stem, ending) pairs in the order the endings are listed; the
// vector is empty when no split qualifies, including for an empty word.
std::vector<std::pair<std::wstring, std::wstring>> getPossibleNounEndingDivisionArr(std::wstring noun)
{
    std::vector<std::pair<std::wstring, std::wstring>> result;
    const std::vector<std::wstring> allNounEndingArr = GetAllNounEndingArr();
    for (const std::wstring& ending : allNounEndingArr)
    {
        // The ending must leave at least one stem character; this also keeps
        // the stem[last] access below in bounds.
        if (ending.size() >= noun.size())
        {
            continue;
        }
        const std::wstring::size_type stemLength = noun.size() - ending.size();
        if (noun.compare(stemLength, ending.size(), ending) != 0)
        {
            continue; // the word does not end with this ending
        }
        if (charIsConsolant(noun[stemLength - 1]))
        {
            result.emplace_back(noun.substr(0, stemLength), ending);
        }
    }
    return result;
}
std::vector<NounTuple> GetPossibleNounTupleArr(std::wstring nounEnding)
{
std::vector<NounTuple> result;
auto nounEndingTable = getNounEndingTable();
for (auto i : nounEndingTable)
{
if (i.second.count(nounEnding) != 0)
{
result.push_back(i.first);
}
}
return result;
}
std::vector<NounTuple> FilterNounTupleArrByNounDeclentionSet(std::vector<NounTuple> nounTupleArr, std::set<NounDeclencion> filter)
{
std::vector<NounTuple> result;
for (auto nounTuple : nounTupleArr)
{
if (filter.count(std::get<0>(nounTuple)) != 0)
{
result.push_back(nounTuple);
}
}
return result;
}
std::wstring RestoreNounByTuple(std::wstring nounBase, NounTuple nounTuple)
{
auto nounEndingTable = getNounEndingTable();
NounTuple nominativeNounTuple{ std::get<0>(nounTuple), NGC_P1_NOMINATIVE, NPF_SINGULAR };
auto nounEndingSet = nounEndingTable[nominativeNounTuple];
if (nounEndingSet.size() != 1)
{
//throw std::exception("There is problem - noun have more than 1 form!");
}
return nounBase + *(nounEndingTable[nominativeNounTuple].begin());
}
std::vector<NounStruct> RecognizeNoun(std::wstring noun)
{
std::cout << "!" << UTF16to8(noun.c_str()) << std::endl;
std::cout << "?" << UTF16to8(frequentWordSet.begin()->c_str()) <<std::endl;
std::vector<NounStruct> result;
auto nounEndingDivisionArr = getPossibleNounEndingDivisionArr(noun);
std::cout << nounEndingDivisionArr.size() << std::endl;
for (auto nounEndingDivision : nounEndingDivisionArr)
{
std::wstring nounBase = nounEndingDivision.first;
std::wstring nounEnding = nounEndingDivision.second;
std::vector<NounTuple> possibleTupleArr = GetPossibleNounTupleArr(nounEnding);
std::cout << "BASE" << UTF16to8(nounBase.c_str()) << std::endl;
for (auto nounTuple : possibleTupleArr)
{
std::wstring nounNominative = RestoreNounByTuple(nounBase, nounTuple);
std::cout <<"Nominative" << UTF16to8(nounNominative.c_str()) << std::endl;
auto possibleNounDetectionSet = GetPossibleNounDeclencionSet(nounNominative);
std::cout <<"setsize" << possibleNounDetectionSet.size() << std::endl;
if (possibleNounDetectionSet.count(std::get<0>(nounTuple)) != 0)
{
std::cout<<"if1" << std::endl;
if (NounIsInDictionary(nounNominative))
{
std::cout <<"result1 go!" << std::endl;
result.push_back({ nounTuple, nounNominative });
}
}
}
}
return result;
}
// Fills frequentWordSet from the frequent-words text file, one word per line.
//
// The file path is fixed at compile time (one path for Windows builds, one
// for all others). Each line is converted with UTF8to16() before being
// stored. Before conversion the line is cleaned up so that stored words
// compare equal to the words being looked up:
//   - a UTF-8 byte order mark at the start of the file is dropped;
//   - a trailing '\r' (left over from CRLF line endings) is dropped;
//   - lines that are empty after that are skipped.
//
// Prints "File found!" or "file not found!" to stdout; when the file cannot
// be opened the set is left untouched.
void LoadFrequentWordSet()
{
#ifdef _WIN32
    std::ifstream f("C:/Workplace/ChineseJournal/rudict/frequent_words.txt");
#else
    std::ifstream f("/home/devuser/workplace/rudict/frequent_words.txt");
#endif
    if (!f.is_open())
    {
        std::cout << "file not found!" << std::endl;
        return;
    }
    std::cout << "File found!" << std::endl;

    std::string line;
    std::wstring wline;
    bool isFirstLine = true;
    while (getline(f, line))
    {
        if (isFirstLine)
        {
            isFirstLine = false;
            // EF BB BF is the UTF-8 encoding of the byte order mark.
            if (line.compare(0, 3, "\xEF\xBB\xBF") == 0)
            {
                line.erase(0, 3);
            }
        }
        if (!line.empty() && line[line.size() - 1] == '\r')
        {
            line.erase(line.size() - 1);
        }
        if (line.empty())
        {
            continue;
        }
        wline = UTF8to16(line.c_str());
        frequentWordSet.insert(wline);
    }
    // The stream closes itself when f goes out of scope.
}