utf8 dammit

This commit is contained in:
Vladislav Khorev 2014-11-27 09:45:52 +00:00
parent b1878507f9
commit 7cf1a99d98
4 changed files with 126 additions and 111 deletions

View File

@ -17,7 +17,7 @@
#include "reply.hpp" #include "reply.hpp"
#include "request.hpp" #include "request.hpp"
#include "boost/algorithm/string.hpp" #include "boost/algorithm/string.hpp"
#include "boost/property_tree/json_parser.hpp" #include "boost/property_tree/json_parser.hpp"
#include "../utf8utf16.h" #include "../utf8utf16.h"
#include "../noun.h" #include "../noun.h"
@ -62,7 +62,7 @@ namespace http {
boost::to_lower(request_path); boost::to_lower(request_path);
std::wstring requestedStr = UTF8to16(request_path.c_str()); std::wstring requestedStr = string_to_wstring(request_path);
/* /*
requestedStr = L"Вы запросили: " + requestedStr; requestedStr = L"Вы запросили: " + requestedStr;
@ -71,13 +71,13 @@ namespace http {
rep.content = "<html><body>" + rep.content + "</body></html>"; rep.content = "<html><body>" + rep.content + "</body></html>";
*/ */
boost::property_tree::wptree propertyTree = PrepareReport(requestedStr); boost::property_tree::wptree propertyTree = PrepareReport(requestedStr);
std::wstringstream output_stream; std::wstringstream output_stream;
boost::property_tree::write_json(output_stream, propertyTree); boost::property_tree::write_json(output_stream, propertyTree);
std::string outputJsonCode = UTF16to8(output_stream.str().c_str()); std::string outputJsonCode = wstring_to_string(output_stream.str());
rep.status = reply::ok; rep.status = reply::ok;

View File

@ -197,8 +197,8 @@ bool NounIsInDictionary(std::wstring nounNominative)
std::cout <<frequentWordSet.size() << std::endl; std::cout <<frequentWordSet.size() << std::endl;
std::cout << "$$" << UTF16to8(frequentWordSet.begin()->c_str()) << std::endl; std::cout << "$$" << wstring_to_string(*frequentWordSet.begin()) << std::endl;
std::cout <<"$$" << UTF16to8(nounNominative.c_str()) << std::endl; std::cout << "$$" << wstring_to_string(nounNominative) << std::endl;
std::cout << "count" << frequentWordSet.count(nounNominative) << std::endl; std::cout << "count" << frequentWordSet.count(nounNominative) << std::endl;
@ -366,9 +366,9 @@ std::wstring RestoreNounByTuple(std::wstring nounBase, NounTuple nounTuple)
std::vector<NounStruct> RecognizeNoun(std::wstring noun) std::vector<NounStruct> RecognizeNoun(std::wstring noun)
{ {
std::cout << "!" << UTF16to8(noun.c_str()) << std::endl; std::cout << "!" << wstring_to_string(noun) << std::endl;
std::cout << "?" << UTF16to8(frequentWordSet.begin()->c_str()) <<std::endl; std::cout << "?" << wstring_to_string(*frequentWordSet.begin()) << std::endl;
std::vector<NounStruct> result; std::vector<NounStruct> result;
@ -384,14 +384,14 @@ std::cout << nounEndingDivisionArr.size() << std::endl;
std::vector<NounTuple> possibleTupleArr = GetPossibleNounTupleArr(nounEnding); std::vector<NounTuple> possibleTupleArr = GetPossibleNounTupleArr(nounEnding);
std::cout << "BASE" << UTF16to8(nounBase.c_str()) << std::endl; std::cout << "BASE" << wstring_to_string(nounBase) << std::endl;
for (auto nounTuple : possibleTupleArr) for (auto nounTuple : possibleTupleArr)
{ {
std::wstring nounNominative = RestoreNounByTuple(nounBase, nounTuple); std::wstring nounNominative = RestoreNounByTuple(nounBase, nounTuple);
std::cout <<"Nominative" << UTF16to8(nounNominative.c_str()) << std::endl; std::cout << "Nominative" << wstring_to_string(nounNominative) << std::endl;
auto possibleNounDetectionSet = GetPossibleNounDeclencionSet(nounNominative); auto possibleNounDetectionSet = GetPossibleNounDeclencionSet(nounNominative);
@ -435,7 +435,7 @@ std::ifstream f("/home/devuser/workplace/rudict/frequent_words.txt");
std::cout<<"File found!" << std::endl; std::cout<<"File found!" << std::endl;
while (getline(f, line)) while (getline(f, line))
{ {
wline = UTF8to16(line.c_str()); wline = string_to_wstring(line);
frequentWordSet.insert(wline); frequentWordSet.insert(wline);
} }
f.close(); f.close();

View File

@ -1,81 +1,95 @@
#include "utf8utf16.h" #include "utf8utf16.h"
#include <string>
#include <string>
#include <boost/locale.hpp>
#include <locale>
std::string wstring_to_string(std::wstring in)
{
/*
std::string out;
unsigned int codepoint = 0;
for (in; *in != 0; ++in)
{
if (*in >= 0xd800 && *in <= 0xdbff)
codepoint = ((*in - 0xd800) << 10) + 0x10000;
else
{
if (*in >= 0xdc00 && *in <= 0xdfff)
codepoint |= *in - 0xdc00;
else
codepoint = *in;
if (codepoint <= 0x7f)
out.append(1, static_cast<char>(codepoint));
else if (codepoint <= 0x7ff)
{
out.append(1, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
}
else if (codepoint <= 0xffff)
{
out.append(1, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
}
else
{
out.append(1, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
out.append(1, static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f)));
out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
}
codepoint = 0;
}
}
return out;*/
std::string out = boost::locale::conv::utf_to_utf<char>(in);
return out;
std::string UTF16to8(const wchar_t * in)
{
std::string out;
unsigned int codepoint = 0;
for (in; *in != 0; ++in)
{
if (*in >= 0xd800 && *in <= 0xdbff)
codepoint = ((*in - 0xd800) << 10) + 0x10000;
else
{
if (*in >= 0xdc00 && *in <= 0xdfff)
codepoint |= *in - 0xdc00;
else
codepoint = *in;
if (codepoint <= 0x7f)
out.append(1, static_cast<char>(codepoint));
else if (codepoint <= 0x7ff)
{
out.append(1, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
}
else if (codepoint <= 0xffff)
{
out.append(1, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
}
else
{
out.append(1, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
out.append(1, static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f)));
out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
}
codepoint = 0;
}
}
return out;
} }
std::wstring UTF8to16(const char * in) std::wstring string_to_wstring(std::string in)
{ {
/*
std::wstring out; std::wstring out;
if (in == NULL) if (in == NULL)
return out; return out;
unsigned int codepoint; unsigned int codepoint;
while (*in != 0) while (*in != 0)
{ {
unsigned char ch = static_cast<unsigned char>(*in); unsigned char ch = static_cast<unsigned char>(*in);
if (ch <= 0x7f) if (ch <= 0x7f)
codepoint = ch; codepoint = ch;
else if (ch <= 0xbf) else if (ch <= 0xbf)
codepoint = (codepoint << 6) | (ch & 0x3f); codepoint = (codepoint << 6) | (ch & 0x3f);
else if (ch <= 0xdf) else if (ch <= 0xdf)
codepoint = ch & 0x1f; codepoint = ch & 0x1f;
else if (ch <= 0xef) else if (ch <= 0xef)
codepoint = ch & 0x0f; codepoint = ch & 0x0f;
else else
codepoint = ch & 0x07; codepoint = ch & 0x07;
++in; ++in;
if (((*in & 0xc0) != 0x80) && (codepoint <= 0x10ffff)) if (((*in & 0xc0) != 0x80) && (codepoint <= 0x10ffff))
{ {
if (codepoint > 0xffff) if (codepoint > 0xffff)
{ {
out.append(1, static_cast<wchar_t>(0xd800 + (codepoint >> 10))); out.append(1, static_cast<wchar_t>(0xd800 + (codepoint >> 10)));
out.append(1, static_cast<wchar_t>(0xdc00 + (codepoint & 0x03ff))); out.append(1, static_cast<wchar_t>(0xdc00 + (codepoint & 0x03ff)));
} }
else if (codepoint < 0xd800 || codepoint >= 0xe000) else if (codepoint < 0xd800 || codepoint >= 0xe000)
out.append(1, static_cast<wchar_t>(codepoint)); out.append(1, static_cast<wchar_t>(codepoint));
} }
} }
return out; return out;
}
*/
std::wstring out = boost::locale::conv::utf_to_utf<wchar_t>(in);
return out;
}

View File

@ -1,17 +1,18 @@
#ifndef UTF8UTF16_H_INCLUDED
#define UTF8UTF16_H_INCLUDED

#include <iostream>
#include <stdexcept>
#include <vector>
#include <string>
#include <cstring>
#include <locale>

// String conversion helpers between narrow (char) and wide (wchar_t) UTF
// strings. These replace the earlier pointer-based UTF8to16 / UTF16to8 pair.

/// Converts a wide string to a narrow string.
/// @param in Wide string to convert (passed by value).
/// @return The converted narrow string.
std::string wstring_to_string(std::wstring in);

/// Converts a narrow string to a wide string.
/// @param in Narrow string to convert (passed by value).
/// @return The converted wide string.
std::wstring string_to_wstring(std::string in);

#endif //UTF8UTF16_H_INCLUDED