From 7cf1a99d988aef48e291a3aa409c1d5de4dfa298 Mon Sep 17 00:00:00 2001 From: Vladislav Khorev Date: Thu, 27 Nov 2014 09:45:52 +0000 Subject: [PATCH] utf8 dammit --- rudict/rudict/http/request_handler.cpp | 18 +-- rudict/rudict/noun.cpp | 14 +- rudict/rudict/utf8utf16.cpp | 170 +++++++++++++------------ rudict/rudict/utf8utf16.h | 35 ++--- 4 files changed, 126 insertions(+), 111 deletions(-) diff --git a/rudict/rudict/http/request_handler.cpp b/rudict/rudict/http/request_handler.cpp index 18efacd..74a9b43 100644 --- a/rudict/rudict/http/request_handler.cpp +++ b/rudict/rudict/http/request_handler.cpp @@ -17,7 +17,7 @@ #include "reply.hpp" #include "request.hpp" -#include "boost/algorithm/string.hpp" +#include "boost/algorithm/string.hpp" #include "boost/property_tree/json_parser.hpp" #include "../utf8utf16.h" #include "../noun.h" @@ -62,7 +62,7 @@ namespace http { boost::to_lower(request_path); - std::wstring requestedStr = UTF8to16(request_path.c_str()); + std::wstring requestedStr = string_to_wstring(request_path); /* requestedStr = L"Вы запросили: " + requestedStr; @@ -71,13 +71,13 @@ namespace http { rep.content = "" + rep.content + ""; */ - boost::property_tree::wptree propertyTree = PrepareReport(requestedStr); - - std::wstringstream output_stream; - - boost::property_tree::write_json(output_stream, propertyTree); - - std::string outputJsonCode = UTF16to8(output_stream.str().c_str()); + boost::property_tree::wptree propertyTree = PrepareReport(requestedStr); + + std::wstringstream output_stream; + + boost::property_tree::write_json(output_stream, propertyTree); + + std::string outputJsonCode = wstring_to_string(output_stream.str()); rep.status = reply::ok; diff --git a/rudict/rudict/noun.cpp b/rudict/rudict/noun.cpp index 38b2ea6..22e012a 100644 --- a/rudict/rudict/noun.cpp +++ b/rudict/rudict/noun.cpp @@ -197,8 +197,8 @@ bool NounIsInDictionary(std::wstring nounNominative) std::cout <c_str()) << std::endl; -std::cout <<"$$" << UTF16to8(nounNominative.c_str()) << std::endl; +std::cout << "$$" << wstring_to_string(*frequentWordSet.begin()) << std::endl; +std::cout << "$$" << wstring_to_string(nounNominative) << std::endl; std::cout << "count" << frequentWordSet.count(nounNominative) << std::endl; @@ -366,9 +366,9 @@ std::wstring RestoreNounByTuple(std::wstring nounBase, NounTuple nounTuple) std::vector RecognizeNoun(std::wstring noun) { -std::cout << "!" << UTF16to8(noun.c_str()) << std::endl; + std::cout << "!" << wstring_to_string(noun) << std::endl; -std::cout << "?" << UTF16to8(frequentWordSet.begin()->c_str()) < result; @@ -384,14 +384,14 @@ std::cout << nounEndingDivisionArr.size() << std::endl; std::vector possibleTupleArr = GetPossibleNounTupleArr(nounEnding); -std::cout << "BASE" << UTF16to8(nounBase.c_str()) << std::endl; + std::cout << "BASE" << wstring_to_string(nounBase) << std::endl; for (auto nounTuple : possibleTupleArr) { std::wstring nounNominative = RestoreNounByTuple(nounBase, nounTuple); -std::cout <<"Nominative" << UTF16to8(nounNominative.c_str()) << std::endl; + std::cout << "Nominative" << wstring_to_string(nounNominative) << std::endl; auto possibleNounDetectionSet = GetPossibleNounDeclencionSet(nounNominative); @@ -435,7 +435,7 @@ std::ifstream f("/home/devuser/workplace/rudict/frequent_words.txt"); std::cout<<"File found!" << std::endl; while (getline(f, line)) { - wline = UTF8to16(line.c_str()); + wline = string_to_wstring(line); frequentWordSet.insert(wline); } f.close(); diff --git a/rudict/rudict/utf8utf16.cpp b/rudict/rudict/utf8utf16.cpp index af4f3b7..3ce2bb7 100644 --- a/rudict/rudict/utf8utf16.cpp +++ b/rudict/rudict/utf8utf16.cpp @@ -1,81 +1,95 @@ -#include "utf8utf16.h" - -#include - +#include "utf8utf16.h" + + +#include +#include +#include + +std::string wstring_to_string(std::wstring in) +{ + /* + std::string out; + unsigned int codepoint = 0; + for (in; *in != 0; ++in) + { + if (*in >= 0xd800 && *in <= 0xdbff) + codepoint = ((*in - 0xd800) << 10) + 0x10000; + else + { + if (*in >= 0xdc00 && *in <= 0xdfff) + codepoint |= *in - 0xdc00; + else + codepoint = *in; + + if (codepoint <= 0x7f) + out.append(1, static_cast(codepoint)); + else if (codepoint <= 0x7ff) + { + out.append(1, static_cast(0xc0 | ((codepoint >> 6) & 0x1f))); + out.append(1, static_cast(0x80 | (codepoint & 0x3f))); + } + else if (codepoint <= 0xffff) + { + out.append(1, static_cast(0xe0 | ((codepoint >> 12) & 0x0f))); + out.append(1, static_cast(0x80 | ((codepoint >> 6) & 0x3f))); + out.append(1, static_cast(0x80 | (codepoint & 0x3f))); + } + else + { + out.append(1, static_cast(0xf0 | ((codepoint >> 18) & 0x07))); + out.append(1, static_cast(0x80 | ((codepoint >> 12) & 0x3f))); + out.append(1, static_cast(0x80 | ((codepoint >> 6) & 0x3f))); + out.append(1, static_cast(0x80 | (codepoint & 0x3f))); + } + codepoint = 0; + } + } + return out;*/ + + std::string out = boost::locale::conv::utf_to_utf(in); + + return out; -std::string UTF16to8(const wchar_t * in) -{ - std::string out; - unsigned int codepoint = 0; - for (in; *in != 0; ++in) - { - if (*in >= 0xd800 && *in <= 0xdbff) - codepoint = ((*in - 0xd800) << 10) + 0x10000; - else - { - if (*in >= 0xdc00 && *in <= 0xdfff) - codepoint |= *in - 0xdc00; - else - codepoint = *in; - - if (codepoint <= 0x7f) - out.append(1, static_cast(codepoint)); - else if (codepoint <= 0x7ff) - { - out.append(1, static_cast(0xc0 | ((codepoint >> 6) & 0x1f))); - out.append(1, static_cast(0x80 | (codepoint & 0x3f))); - } - else if (codepoint <= 0xffff) - { - out.append(1, static_cast(0xe0 | ((codepoint >> 12) & 0x0f))); - out.append(1, static_cast(0x80 | ((codepoint >> 6) & 0x3f))); - out.append(1, static_cast(0x80 | (codepoint & 0x3f))); - } - else - { - out.append(1, static_cast(0xf0 | ((codepoint >> 18) & 0x07))); - out.append(1, static_cast(0x80 | ((codepoint >> 12) & 0x3f))); - out.append(1, static_cast(0x80 | ((codepoint >> 6) & 0x3f))); - out.append(1, static_cast(0x80 | (codepoint & 0x3f))); - } - codepoint = 0; - } - } - return out; } -std::wstring UTF8to16(const char * in) -{ - - std::wstring out; - if (in == NULL) - return out; - - unsigned int codepoint; - while (*in != 0) - { - unsigned char ch = static_cast(*in); - if (ch <= 0x7f) - codepoint = ch; - else if (ch <= 0xbf) - codepoint = (codepoint << 6) | (ch & 0x3f); - else if (ch <= 0xdf) - codepoint = ch & 0x1f; - else if (ch <= 0xef) - codepoint = ch & 0x0f; - else - codepoint = ch & 0x07; - ++in; - if (((*in & 0xc0) != 0x80) && (codepoint <= 0x10ffff)) - { - if (codepoint > 0xffff) - { - out.append(1, static_cast(0xd800 + (codepoint >> 10))); - out.append(1, static_cast(0xdc00 + (codepoint & 0x03ff))); - } - else if (codepoint < 0xd800 || codepoint >= 0xe000) - out.append(1, static_cast(codepoint)); - } - } - return out; -} +std::wstring string_to_wstring(std::string in) +{ + /* + std::wstring out; + if (in == NULL) + return out; + + unsigned int codepoint; + while (*in != 0) + { + unsigned char ch = static_cast(*in); + if (ch <= 0x7f) + codepoint = ch; + else if (ch <= 0xbf) + codepoint = (codepoint << 6) | (ch & 0x3f); + else if (ch <= 0xdf) + codepoint = ch & 0x1f; + else if (ch <= 0xef) + codepoint = ch & 0x0f; + else + codepoint = ch & 0x07; + ++in; + if (((*in & 0xc0) != 0x80) && (codepoint <= 0x10ffff)) + { + if (codepoint > 0xffff) + { + out.append(1, static_cast(0xd800 + (codepoint >> 10))); + out.append(1, static_cast(0xdc00 + (codepoint & 0x03ff))); + } + else if (codepoint < 0xd800 || codepoint >= 0xe000) + out.append(1, static_cast(codepoint)); + } + } + return out; + + */ + + std::wstring out = boost::locale::conv::utf_to_utf(in); + + return out; +} diff --git a/rudict/rudict/utf8utf16.h b/rudict/rudict/utf8utf16.h index ab1e77c..870083e 100644 --- a/rudict/rudict/utf8utf16.h +++ b/rudict/rudict/utf8utf16.h @@ -1,17 +1,18 @@ -#ifndef UTF8UTF16_H_INCLUDED -#define UTF8UTF16_H_INCLUDED - - -#include -#include -#include -#include -#include -#include - -std::wstring UTF8to16(const char * in); -std::string UTF16to8(const wchar_t * in); - - - -#endif //UTF8UTF16_H_INCLUDED +#ifndef UTF8UTF16_H_INCLUDED +#define UTF8UTF16_H_INCLUDED + + +#include +#include +#include +#include +#include +#include + +std::string wstring_to_string(std::wstring in); + +std::wstring string_to_wstring(std::string in); + + + +#endif //UTF8UTF16_H_INCLUDED