utf8 dammit

2014-11-27 09:45:52 +00:00 · 2014-11-27 09:45:52 +00:00 · 7cf1a99d98
commit 7cf1a99d98
parent b1878507f9
4 changed files with 126 additions and 111 deletions
--- a/rudict/rudict/http/request_handler.cpp
+++ b/rudict/rudict/http/request_handler.cpp
@@ -17,7 +17,7 @@
 #include "reply.hpp"
 #include "request.hpp"

-#include "boost/algorithm/string.hpp"
+#include "boost/algorithm/string.hpp"
 #include "boost/property_tree/json_parser.hpp"
 #include "../utf8utf16.h"
 #include "../noun.h"
@@ -62,7 +62,7 @@ namespace http {
 			boost::to_lower(request_path);

 			
-			std::wstring requestedStr = UTF8to16(request_path.c_str());
+			std::wstring requestedStr = string_to_wstring(request_path);
 			/*
 			requestedStr = L"Вы запросили: " + requestedStr;

@@ -71,13 +71,13 @@ namespace http {
 			rep.content = "<html><body>" + rep.content + "</body></html>";
 			*/

-			boost::property_tree::wptree propertyTree = PrepareReport(requestedStr);
-
-			std::wstringstream output_stream;
-
-			boost::property_tree::write_json(output_stream, propertyTree);
-
-			std::string outputJsonCode = UTF16to8(output_stream.str().c_str());
+			boost::property_tree::wptree propertyTree = PrepareReport(requestedStr);
+
+			std::wstringstream output_stream;
+
+			boost::property_tree::write_json(output_stream, propertyTree);
+
+			std::string outputJsonCode = wstring_to_string(output_stream.str());

 			rep.status = reply::ok;

--- a/rudict/rudict/noun.cpp
+++ b/rudict/rudict/noun.cpp
@@ -197,8 +197,8 @@ bool NounIsInDictionary(std::wstring nounNominative)

 std::cout <<frequentWordSet.size() << std::endl;

-std::cout << "$$" << UTF16to8(frequentWordSet.begin()->c_str()) << std::endl;
-std::cout <<"$$" << UTF16to8(nounNominative.c_str()) << std::endl;
+std::cout << "$$" << wstring_to_string(*frequentWordSet.begin()) << std::endl;
+std::cout << "$$" << wstring_to_string(nounNominative) << std::endl;

 std::cout << "count" << frequentWordSet.count(nounNominative) << std::endl;

@@ -366,9 +366,9 @@ std::wstring RestoreNounByTuple(std::wstring nounBase, NounTuple nounTuple)
 std::vector<NounStruct> RecognizeNoun(std::wstring noun)
 {

-std::cout << "!" << UTF16to8(noun.c_str()) << std::endl;
+	std::cout << "!" << wstring_to_string(noun) << std::endl;

-std::cout << "?" << UTF16to8(frequentWordSet.begin()->c_str()) <<std::endl;
+	std::cout << "?" << wstring_to_string(*frequentWordSet.begin()) << std::endl;


 	std::vector<NounStruct> result;
@@ -384,14 +384,14 @@ std::cout << nounEndingDivisionArr.size() << std::endl;

 		std::vector<NounTuple> possibleTupleArr = GetPossibleNounTupleArr(nounEnding);

-std::cout << "BASE" << UTF16to8(nounBase.c_str()) << std::endl;
+		std::cout << "BASE" << wstring_to_string(nounBase) << std::endl;

 	
 		for (auto nounTuple : possibleTupleArr)
 		{
 			std::wstring nounNominative = RestoreNounByTuple(nounBase, nounTuple);

-std::cout <<"Nominative" << UTF16to8(nounNominative.c_str()) << std::endl;
+			std::cout << "Nominative" << wstring_to_string(nounNominative) << std::endl;
 		
 			auto possibleNounDetectionSet = GetPossibleNounDeclencionSet(nounNominative);

@@ -435,7 +435,7 @@ std::ifstream f("/home/devuser/workplace/rudict/frequent_words.txt");
 	    std::cout<<"File found!" << std::endl;
 		while (getline(f, line))
 		{
-			wline = UTF8to16(line.c_str());
+			wline = string_to_wstring(line);
 			frequentWordSet.insert(wline);
 		}
 		f.close();
--- a/rudict/rudict/utf8utf16.cpp
+++ b/rudict/rudict/utf8utf16.cpp
@@ -1,81 +1,95 @@
-#include "utf8utf16.h"
-
-#include <string>
-
+#include "utf8utf16.h"
+
+
+#include <string>
+#include <boost/locale.hpp>
+#include <locale>
+
+std::string wstring_to_string(std::wstring in)
+{
+	/*
+	std::string out;
+    unsigned int codepoint = 0;
+    for (in;  *in != 0;  ++in)
+    {
+        if (*in >= 0xd800 && *in <= 0xdbff)
+            codepoint = ((*in - 0xd800) << 10) + 0x10000;
+        else
+        {
+            if (*in >= 0xdc00 && *in <= 0xdfff)
+                codepoint |= *in - 0xdc00;
+            else
+                codepoint = *in;
+
+            if (codepoint <= 0x7f)
+                out.append(1, static_cast<char>(codepoint));
+            else if (codepoint <= 0x7ff)
+            {
+                out.append(1, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
+                out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
+            }
+            else if (codepoint <= 0xffff)
+            {
+                out.append(1, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
+                out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
+                out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
+            }
+            else
+            {
+                out.append(1, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
+                out.append(1, static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f)));
+                out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
+                out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
+            }
+            codepoint = 0;
+        }
+    }
+    return out;*/
+
+	std::string out = boost::locale::conv::utf_to_utf<char>(in);
+
+	return out;

-std::string UTF16to8(const wchar_t * in)
-{
-	std::string out;
-    unsigned int codepoint = 0;
-    for (in;  *in != 0;  ++in)
-    {
-        if (*in >= 0xd800 && *in <= 0xdbff)
-            codepoint = ((*in - 0xd800) << 10) + 0x10000;
-        else
-        {
-            if (*in >= 0xdc00 && *in <= 0xdfff)
-                codepoint |= *in - 0xdc00;
-            else
-                codepoint = *in;
-
-            if (codepoint <= 0x7f)
-                out.append(1, static_cast<char>(codepoint));
-            else if (codepoint <= 0x7ff)
-            {
-                out.append(1, static_cast<char>(0xc0 | ((codepoint >> 6) & 0x1f)));
-                out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
-            }
-            else if (codepoint <= 0xffff)
-            {
-                out.append(1, static_cast<char>(0xe0 | ((codepoint >> 12) & 0x0f)));
-                out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
-                out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
-            }
-            else
-            {
-                out.append(1, static_cast<char>(0xf0 | ((codepoint >> 18) & 0x07)));
-                out.append(1, static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f)));
-                out.append(1, static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f)));
-                out.append(1, static_cast<char>(0x80 | (codepoint & 0x3f)));
-            }
-            codepoint = 0;
-        }
-    }
-    return out;
 }

-std::wstring UTF8to16(const char * in)
-{
-
-	std::wstring out;
-	if (in == NULL)
-		return out;
-
-	unsigned int codepoint;
-	while (*in != 0)
-	{
-		unsigned char ch = static_cast<unsigned char>(*in);
-		if (ch <= 0x7f)
-			codepoint = ch;
-		else if (ch <= 0xbf)
-			codepoint = (codepoint << 6) | (ch & 0x3f);
-		else if (ch <= 0xdf)
-			codepoint = ch & 0x1f;
-		else if (ch <= 0xef)
-			codepoint = ch & 0x0f;
-		else
-			codepoint = ch & 0x07;
-		++in;
-		if (((*in & 0xc0) != 0x80) && (codepoint <= 0x10ffff))
-		{
-			if (codepoint > 0xffff)
-			{
-				out.append(1, static_cast<wchar_t>(0xd800 + (codepoint >> 10)));
-				out.append(1, static_cast<wchar_t>(0xdc00 + (codepoint & 0x03ff)));
-			}
-			else if (codepoint < 0xd800 || codepoint >= 0xe000)
-				out.append(1, static_cast<wchar_t>(codepoint));
-		}
-	}
-	return out;
-}
+std::wstring string_to_wstring(std::string in)
+{
+	/*
+	std::wstring out;
+	if (in == NULL)
+		return out;
+
+	unsigned int codepoint;
+	while (*in != 0)
+	{
+		unsigned char ch = static_cast<unsigned char>(*in);
+		if (ch <= 0x7f)
+			codepoint = ch;
+		else if (ch <= 0xbf)
+			codepoint = (codepoint << 6) | (ch & 0x3f);
+		else if (ch <= 0xdf)
+			codepoint = ch & 0x1f;
+		else if (ch <= 0xef)
+			codepoint = ch & 0x0f;
+		else
+			codepoint = ch & 0x07;
+		++in;
+		if (((*in & 0xc0) != 0x80) && (codepoint <= 0x10ffff))
+		{
+			if (codepoint > 0xffff)
+			{
+				out.append(1, static_cast<wchar_t>(0xd800 + (codepoint >> 10)));
+				out.append(1, static_cast<wchar_t>(0xdc00 + (codepoint & 0x03ff)));
+			}
+			else if (codepoint < 0xd800 || codepoint >= 0xe000)
+				out.append(1, static_cast<wchar_t>(codepoint));
+		}
+	}
+	return out;
+
+	*/
+
+	std::wstring out = boost::locale::conv::utf_to_utf<wchar_t>(in);
+
+	return out;
+}
--- a/rudict/rudict/utf8utf16.h
+++ b/rudict/rudict/utf8utf16.h
@@ -1,17 +1,18 @@
-#ifndef UTF8UTF16_H_INCLUDED
-#define UTF8UTF16_H_INCLUDED
-
-
-#include <iostream>
-#include <stdexcept>
-#include <vector>
-#include <string>
-#include <cstring>
-#include <locale>
-
-std::wstring UTF8to16(const char * in);
-std::string UTF16to8(const wchar_t * in);
-
-
-
-#endif //UTF8UTF16_H_INCLUDED
+#ifndef UTF8UTF16_H_INCLUDED
+#define UTF8UTF16_H_INCLUDED
+
+
+#include <iostream>
+#include <stdexcept>
+#include <vector>
+#include <string>
+#include <cstring>
+#include <locale>
+
+std::string wstring_to_string(std::wstring in);
+
+std::wstring string_to_wstring(std::string in);
+
+
+
+#endif //UTF8UTF16_H_INCLUDED