chinese-journal/rudict/rudict/noun.cpp
2014-11-26 22:22:33 +00:00

396 lines
13 KiB
C++

#include "noun.h"
#include <iostream> //Xperimental -- for debug only
#include "utf8utf16.h"
// Returns a human-readable English label for a noun declension class.
// Any value not handled by the switch yields an empty string.
std::wstring NounDeclencionToWString(NounDeclencion nounDeclencion)
{
    switch (nounDeclencion)
    {
    case ND_1_HARD: return L"First declencion (hard type), female";
    case ND_1_SOFT: return L"First declencion (soft type), female";
    case ND_2_HARD_MALE: return L"Second declencion (hard type), male";
    case ND_2_SOFT_MALE: return L"Second declencion (soft type), male";
    // The vowel named in the label must match the enumerator suffix (_O / _E).
    case ND_2_NEUTER_O: return L"Second declencion, O-ending, neuter";
    case ND_2_NEUTER_E: return L"Second declencion, E-ending, neuter";
    case ND_3: return L"Third declencion, female";
    }
    return L"";
}
// Maps a grammatical case enumerator to its English display name.
// Values outside the six known cases produce an empty string.
std::wstring NounGrammaticalCaseToWString(NounGrammaticalCase nounGrammaticalCase)
{
    const wchar_t* label = L"";
    switch (nounGrammaticalCase)
    {
    case NGC_P1_NOMINATIVE:
        label = L"Nominative case";
        break;
    case NGC_P2_GENITIVE:
        label = L"Genitive case";
        break;
    case NGC_P3_DATIVE:
        label = L"Dative case";
        break;
    case NGC_P4_ACCUSATIVE:
        label = L"Accusative case";
        break;
    case NGC_P5_INSTRUMENTAL:
        label = L"Instrumental case";
        break;
    case NGC_P6_PREPOSITIONAL:
        label = L"Prepositional case";
        break;
    }
    return label;
}
// Maps a grammatical number enumerator to its English display name.
// Anything other than singular/plural produces an empty string.
std::wstring NounNumberToWString(NounNumber nounNumber)
{
    if (nounNumber == NPF_SINGULAR)
    {
        return L"Singular form";
    }
    if (nounNumber == NPF_PLURAL)
    {
        return L"Plural form";
    }
    return L"";
}
// Set of known words: filled by LoadFrequentWordSet() and queried by
// NounIsInDictionary(). Empty until LoadFrequentWordSet() has been called.
std::set<std::wstring> frequentWordSet;
// Returns every noun ending that getPossibleNounEndingDivisionArr() tries to
// split off a word, including the empty ending.
//
// Each ending is listed exactly once: a repeated entry would make the caller
// emit the same (stem, ending) division twice. The list also has to contain
// every ending used in getNounEndingTable(), otherwise forms with that ending
// can never be recognized.
//
// NOTE(review): the literals look like Windows-1251 Cyrillic rendered as
// Latin-1 -- confirm the encoding this file is compiled with.
std::vector<std::wstring> GetAllNounEndingArr()
{
    std::vector<std::wstring> result
    {
        L"",
        L"à",
        L"è",
        L"å",
        L"ó",
        L"îé",
        L"û",
        L"îì",
        L"ü",
        L"ÿ",
        L"þ",
        L"åì",
        L"î",
        L"üþ",
        L"àì",
        L"àìè",
        L"àõ",
        L"îâ",
        L"åé",
        L"ÿì",
        L"ÿõ",
        L"ÿìè",
        // Alternative instrumental singular endings present in the ending table.
        L"îþ",
        L"åþ",
    };
    return result;
}
std::map<NounTuple, StringSet> getNounEndingTable()
{
std::map<NounTuple, StringSet> result;
//Singular
result[NounTuple{ ND_1_SOFT, NGC_P1_NOMINATIVE, NPF_SINGULAR }] = StringSet{ L"à" };
result[NounTuple{ ND_1_SOFT, NGC_P2_GENITIVE, NPF_SINGULAR }] = StringSet{L"è"};
result[NounTuple{ ND_1_SOFT, NGC_P3_DATIVE, NPF_SINGULAR }] = StringSet{L"å"};
result[NounTuple{ ND_1_SOFT, NGC_P4_ACCUSATIVE, NPF_SINGULAR }] = StringSet{L"ó"};
result[NounTuple{ ND_1_SOFT, NGC_P5_INSTRUMENTAL, NPF_SINGULAR }] = StringSet{ L"îé", L"îþ" };
result[NounTuple{ ND_1_SOFT, NGC_P6_PREPOSITIONAL, NPF_SINGULAR }] = StringSet{L"å"};
result[NounTuple{ ND_1_HARD, NGC_P1_NOMINATIVE, NPF_SINGULAR }] = StringSet{L"à"};
result[NounTuple{ ND_1_HARD, NGC_P2_GENITIVE, NPF_SINGULAR }] = StringSet{L"û"};
result[NounTuple{ ND_1_HARD, NGC_P3_DATIVE, NPF_SINGULAR }] = StringSet{L"å"};
result[NounTuple{ ND_1_HARD, NGC_P4_ACCUSATIVE, NPF_SINGULAR }] = StringSet{L"ó"};
result[NounTuple{ ND_1_HARD, NGC_P5_INSTRUMENTAL, NPF_SINGULAR }] = StringSet{ L"îé", L"îþ", L"åé", L"åþ" };
result[NounTuple{ ND_1_HARD, NGC_P6_PREPOSITIONAL, NPF_SINGULAR }] = StringSet{L"å"};
result[NounTuple{ ND_2_HARD_MALE, NGC_P1_NOMINATIVE, NPF_SINGULAR }] = StringSet{L""};
result[NounTuple{ ND_2_HARD_MALE, NGC_P2_GENITIVE, NPF_SINGULAR }] = StringSet{L"à"};
result[NounTuple{ ND_2_HARD_MALE, NGC_P3_DATIVE, NPF_SINGULAR }] = StringSet{L"ó"};
result[NounTuple{ ND_2_HARD_MALE, NGC_P4_ACCUSATIVE, NPF_SINGULAR }] = StringSet{L""};
result[NounTuple{ ND_2_HARD_MALE, NGC_P5_INSTRUMENTAL, NPF_SINGULAR }] = StringSet{L"îì"};
result[NounTuple{ ND_2_HARD_MALE, NGC_P6_PREPOSITIONAL, NPF_SINGULAR }] = StringSet{L"å"};
result[NounTuple{ ND_2_SOFT_MALE, NGC_P1_NOMINATIVE, NPF_SINGULAR }] = StringSet{L"ü"};
result[NounTuple{ ND_2_SOFT_MALE, NGC_P2_GENITIVE, NPF_SINGULAR }] = StringSet{L"ÿ"};
result[NounTuple{ ND_2_SOFT_MALE, NGC_P3_DATIVE, NPF_SINGULAR }] = StringSet{L"þ"};
result[NounTuple{ ND_2_SOFT_MALE, NGC_P4_ACCUSATIVE, NPF_SINGULAR }] = StringSet{L"ü"};
result[NounTuple{ ND_2_SOFT_MALE, NGC_P5_INSTRUMENTAL, NPF_SINGULAR }] = StringSet{L"åì"};
result[NounTuple{ ND_2_SOFT_MALE, NGC_P6_PREPOSITIONAL, NPF_SINGULAR }] = StringSet{L"å"};
result[NounTuple{ ND_2_NEUTER_O, NGC_P1_NOMINATIVE, NPF_SINGULAR }] = StringSet{L"î"};
result[NounTuple{ ND_2_NEUTER_O, NGC_P2_GENITIVE, NPF_SINGULAR }] = StringSet{L"à"};
result[NounTuple{ ND_2_NEUTER_O, NGC_P3_DATIVE, NPF_SINGULAR }] = StringSet{L"ó"};
result[NounTuple{ ND_2_NEUTER_O, NGC_P4_ACCUSATIVE, NPF_SINGULAR }] = StringSet{L"î"};
result[NounTuple{ ND_2_NEUTER_O, NGC_P5_INSTRUMENTAL, NPF_SINGULAR }] = StringSet{L"îì"};
result[NounTuple{ ND_2_NEUTER_O, NGC_P6_PREPOSITIONAL, NPF_SINGULAR }] = StringSet{L"å"};
result[NounTuple{ ND_2_NEUTER_E, NGC_P1_NOMINATIVE, NPF_SINGULAR }] = StringSet{L"å"};
result[NounTuple{ ND_2_NEUTER_E, NGC_P2_GENITIVE, NPF_SINGULAR }] = StringSet{L"ÿ"};
result[NounTuple{ ND_2_NEUTER_E, NGC_P3_DATIVE, NPF_SINGULAR }] = StringSet{L"þ"};
result[NounTuple{ ND_2_NEUTER_E, NGC_P4_ACCUSATIVE, NPF_SINGULAR }] = StringSet{L"å"};
result[NounTuple{ ND_2_NEUTER_E, NGC_P5_INSTRUMENTAL, NPF_SINGULAR }] = StringSet{L"åì"};
result[NounTuple{ ND_2_NEUTER_E, NGC_P6_PREPOSITIONAL, NPF_SINGULAR }] = StringSet{L"å"};
result[NounTuple{ ND_3, NGC_P1_NOMINATIVE, NPF_SINGULAR }] = StringSet{L"ü"};
result[NounTuple{ ND_3, NGC_P2_GENITIVE, NPF_SINGULAR }] = StringSet{L"è"};
result[NounTuple{ ND_3, NGC_P3_DATIVE, NPF_SINGULAR }] = StringSet{L"è"};
result[NounTuple{ ND_3, NGC_P4_ACCUSATIVE, NPF_SINGULAR }] = StringSet{L"ü"};
result[NounTuple{ ND_3, NGC_P5_INSTRUMENTAL, NPF_SINGULAR }] = StringSet{L"üþ"};
result[NounTuple{ ND_3, NGC_P6_PREPOSITIONAL, NPF_SINGULAR }] = StringSet{L"è"};
//Plural
result[NounTuple{ ND_1_SOFT, NGC_P1_NOMINATIVE, NPF_PLURAL }] = StringSet{L"è"};
result[NounTuple{ ND_1_SOFT, NGC_P2_GENITIVE, NPF_PLURAL }] = StringSet{L""}; //Xperimental -- need special modificator for suffix
result[NounTuple{ ND_1_SOFT, NGC_P3_DATIVE, NPF_PLURAL }] = StringSet{L"àì"};
result[NounTuple{ ND_1_SOFT, NGC_P4_ACCUSATIVE, NPF_PLURAL }] = StringSet{L"è"};
result[NounTuple{ ND_1_SOFT, NGC_P5_INSTRUMENTAL, NPF_PLURAL }] = StringSet{L"àìè"};
result[NounTuple{ ND_1_SOFT, NGC_P6_PREPOSITIONAL, NPF_PLURAL }] = StringSet{L"àõ"};
result[NounTuple{ ND_1_HARD, NGC_P1_NOMINATIVE, NPF_PLURAL }] = StringSet{L"è"};
result[NounTuple{ ND_1_HARD, NGC_P2_GENITIVE, NPF_PLURAL }] = StringSet{L""};
result[NounTuple{ ND_1_HARD, NGC_P3_DATIVE, NPF_PLURAL }] = StringSet{L"àì"};
result[NounTuple{ ND_1_HARD, NGC_P4_ACCUSATIVE, NPF_PLURAL }] = StringSet{L""};
result[NounTuple{ ND_1_HARD, NGC_P5_INSTRUMENTAL, NPF_PLURAL }] = StringSet{L"àìè"};
result[NounTuple{ ND_1_HARD, NGC_P6_PREPOSITIONAL, NPF_PLURAL }] = StringSet{L"àõ"};
result[NounTuple{ ND_2_HARD_MALE, NGC_P1_NOMINATIVE, NPF_PLURAL }] = StringSet{L"û"};
result[NounTuple{ ND_2_HARD_MALE, NGC_P2_GENITIVE, NPF_PLURAL }] = StringSet{L"îâ"};
result[NounTuple{ ND_2_HARD_MALE, NGC_P3_DATIVE, NPF_PLURAL }] = StringSet{L"àì"};
result[NounTuple{ ND_2_HARD_MALE, NGC_P4_ACCUSATIVE, NPF_PLURAL }] = StringSet{L"û"};
result[NounTuple{ ND_2_HARD_MALE, NGC_P5_INSTRUMENTAL, NPF_PLURAL }] = StringSet{L"àìè"};
result[NounTuple{ ND_2_HARD_MALE, NGC_P6_PREPOSITIONAL, NPF_PLURAL }] = StringSet{L"àõ"};
result[NounTuple{ ND_2_SOFT_MALE, NGC_P1_NOMINATIVE, NPF_PLURAL }] = StringSet{L"è"};
result[NounTuple{ ND_2_SOFT_MALE, NGC_P2_GENITIVE, NPF_PLURAL }] = StringSet{L"åé"};
result[NounTuple{ ND_2_SOFT_MALE, NGC_P3_DATIVE, NPF_PLURAL }] = StringSet{L"ÿì"};
result[NounTuple{ ND_2_SOFT_MALE, NGC_P4_ACCUSATIVE, NPF_PLURAL }] = StringSet{L"è"};
result[NounTuple{ ND_2_SOFT_MALE, NGC_P5_INSTRUMENTAL, NPF_PLURAL }] = StringSet{L"è"};
result[NounTuple{ ND_2_SOFT_MALE, NGC_P6_PREPOSITIONAL, NPF_PLURAL }] = StringSet{L"ÿõ"};
result[NounTuple{ ND_2_NEUTER_O, NGC_P1_NOMINATIVE, NPF_PLURAL }] = StringSet{L"à"};
result[NounTuple{ ND_2_NEUTER_O, NGC_P2_GENITIVE, NPF_PLURAL }] = StringSet{L""};
result[NounTuple{ ND_2_NEUTER_O, NGC_P3_DATIVE, NPF_PLURAL }] = StringSet{L"àì"};
result[NounTuple{ ND_2_NEUTER_O, NGC_P4_ACCUSATIVE, NPF_PLURAL }] = StringSet{L"à"};
result[NounTuple{ ND_2_NEUTER_O, NGC_P5_INSTRUMENTAL, NPF_PLURAL }] = StringSet{L"àìè"};
result[NounTuple{ ND_2_NEUTER_O, NGC_P6_PREPOSITIONAL, NPF_PLURAL }] = StringSet{L"àõ"};
result[NounTuple{ ND_2_NEUTER_E, NGC_P1_NOMINATIVE, NPF_PLURAL }] = StringSet{L"ÿ"};
result[NounTuple{ ND_2_NEUTER_E, NGC_P2_GENITIVE, NPF_PLURAL }] = StringSet{L"åé"};
result[NounTuple{ ND_2_NEUTER_E, NGC_P3_DATIVE, NPF_PLURAL }] = StringSet{L"ÿì"};
result[NounTuple{ ND_2_NEUTER_E, NGC_P4_ACCUSATIVE, NPF_PLURAL }] = StringSet{L"ÿ"};
result[NounTuple{ ND_2_NEUTER_E, NGC_P5_INSTRUMENTAL, NPF_PLURAL }] = StringSet{L"ÿìè"};
result[NounTuple{ ND_2_NEUTER_E, NGC_P6_PREPOSITIONAL, NPF_PLURAL }] = StringSet{L"ÿõ"};
result[NounTuple{ ND_3, NGC_P1_NOMINATIVE, NPF_PLURAL }] = StringSet{L"è"};
result[NounTuple{ ND_3, NGC_P2_GENITIVE, NPF_PLURAL }] = StringSet{L"åé"};
result[NounTuple{ ND_3, NGC_P3_DATIVE, NPF_PLURAL }] = StringSet{L"ÿì"};
result[NounTuple{ ND_3, NGC_P4_ACCUSATIVE, NPF_PLURAL }] = StringSet{L"è"};
result[NounTuple{ ND_3, NGC_P5_INSTRUMENTAL, NPF_PLURAL }] = StringSet{L"ÿìè"};
result[NounTuple{ ND_3, NGC_P6_PREPOSITIONAL, NPF_PLURAL }] = StringSet{L"ÿõ"};
return result;
}
// Tells whether the given word is present in the global frequentWordSet.
// The lookup is an exact match on the string passed in.
bool NounIsInDictionary(std::wstring nounNominative)
{
    return frequentWordSet.find(nounNominative) != frequentWordSet.end();
}
std::set<NounDeclencion> GetPossibleNounDeclencionSet(std::wstring nounNominative)
{
if (nounNominative.size() <= 1)
{
//Xperimental -- need to say that word is too short!
return{};
}
wchar_t lastChar = nounNominative[nounNominative.size()-1];
wchar_t prevLastChar = nounNominative[nounNominative.size() - 2];
if (lastChar == L'à')
{
return{ ND_1_HARD, ND_1_SOFT };
}
if (lastChar == L'î')
{
return{ ND_2_NEUTER_O };
}
if (lastChar == L'å')
{
return{ ND_2_NEUTER_E };
}
if (lastChar == L'ü')
{
return{ ND_2_SOFT_MALE };
}
return{ ND_2_HARD_MALE };
}
// Returns true when c is one of the letters in the consonant list below.
// NOTE(review): the literal looks like Windows-1251 Cyrillic rendered as
// Latin-1 -- confirm the encoding this file is compiled with.
bool charIsConsolant(wchar_t c)
{
    const std::wstring consonantLetters = L"éöêíãøùçõôâïðëäæ÷ñìòá";
    return consonantLetters.find(c) != std::wstring::npos;
}
// Returns true when c is one of the letters in the vowel list below.
// NOTE(review): the literal looks like Windows-1251 Cyrillic rendered as
// Latin-1 -- confirm the encoding this file is compiled with.
bool charIsVowel(wchar_t c)
{
    const std::wstring vowelLetters = L"àîóûýÿ¸þèå";
    return vowelLetters.find(c) != std::wstring::npos;
}
// Splits a noun into every plausible (stem, ending) pair.
//
// For each ending from GetAllNounEndingArr() that the noun ends with, the part
// before it is taken as the stem. A division is kept only when the stem is
// non-empty and its last character satisfies charIsConsolant().
// Pairs are returned in the order of the ending list.
std::vector<std::pair<std::wstring, std::wstring>> getPossibleNounEndingDivisionArr(std::wstring noun)
{
    std::vector<std::pair<std::wstring, std::wstring>> result;
    const auto allNounEndingArr = GetAllNounEndingArr();
    for (const auto& ending : allNounEndingArr)
    {
        // The stem needs at least one character; otherwise there is no last
        // stem character to inspect (this also covers an empty noun).
        if (ending.size() >= noun.size())
        {
            continue;
        }
        const std::wstring::size_type stemLength = noun.size() - ending.size();
        if (noun.compare(stemLength, ending.size(), ending) != 0)
        {
            continue; // noun does not end with this ending
        }
        if (charIsConsolant(noun[stemLength - 1]))
        {
            result.push_back({ noun.substr(0, stemLength), ending });
        }
    }
    return result;
}
std::vector<NounTuple> GetPossibleNounTupleArr(std::wstring nounEnding)
{
std::vector<NounTuple> result;
auto nounEndingTable = getNounEndingTable();
for (auto i : nounEndingTable)
{
if (i.second.count(nounEnding) != 0)
{
result.push_back(i.first);
}
}
return result;
}
std::vector<NounTuple> FilterNounTupleArrByNounDeclentionSet(std::vector<NounTuple> nounTupleArr, std::set<NounDeclencion> filter)
{
std::vector<NounTuple> result;
for (auto nounTuple : nounTupleArr)
{
if (filter.count(std::get<0>(nounTuple)) != 0)
{
result.push_back(nounTuple);
}
}
return result;
}
std::wstring RestoreNounByTuple(std::wstring nounBase, NounTuple nounTuple)
{
auto nounEndingTable = getNounEndingTable();
NounTuple nominativeNounTuple{ std::get<0>(nounTuple), NGC_P1_NOMINATIVE, NPF_SINGULAR };
auto nounEndingSet = nounEndingTable[nominativeNounTuple];
if (nounEndingSet.size() != 1)
{
//throw std::exception("There is problem - noun have more than 1 form!");
}
return nounBase + *(nounEndingTable[nominativeNounTuple].begin());
}
std::vector<NounStruct> RecognizeNoun(std::wstring noun)
{
std::vector<NounStruct> result;
auto nounEndingDivisionArr = getPossibleNounEndingDivisionArr(noun);
for (auto nounEndingDivision : nounEndingDivisionArr)
{
std::wstring nounBase = nounEndingDivision.first;
std::wstring nounEnding = nounEndingDivision.second;
std::vector<NounTuple> possibleTupleArr = GetPossibleNounTupleArr(nounEnding);
for (auto nounTuple : possibleTupleArr)
{
std::wstring nounNominative = RestoreNounByTuple(nounBase, nounTuple);
auto possibleNounDetectionSet = GetPossibleNounDeclencionSet(nounNominative);
if (possibleNounDetectionSet.count(std::get<0>(nounTuple)) != 0)
{
if (NounIsInDictionary(nounNominative))
{
result.push_back({ nounTuple, nounNominative });
}
}
}
}
return result;
}
// Fills the global frequentWordSet from the word list file, one entry per line.
// Each line is read as a narrow string and converted with UTF8to16() before
// insertion. If the file cannot be opened the set is left untouched and no
// error is reported.
void LoadFrequentWordSet()
{
    std::ifstream wordFile("C:/Workplace/ChineseJournal/rudict/frequent_words.txt");
    if (!wordFile.is_open())
    {
        return;
    }
    std::string utf8Line;
    while (getline(wordFile, utf8Line))
    {
        std::wstring wideLine;
        wideLine = UTF8to16(utf8Line.c_str());
        frequentWordSet.insert(wideLine);
    }
    wordFile.close();
}