Tutorial: How to read a file that contains \uXXXX escape sequences in VC++



Question:

I have a text file whose contents are:

\u041f\u0435\u0440\u0432\u044b\u0439_\u0438\u043d\u0442\u0435\u0440\u0430\u043a\u0442\u0438\u0432\u043d\u044b\u0439_\u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442_\u043a\u0430\u043d\u0430\u043b

How can I read such a file to get a result like this:

"Первый_интерактивный_интернет_канал"

If I type this:

string str = _T("\u041f\u0435\u0440\u0432\u044b\u0439_\u0438\u043d\u0442\u0435\u0440\u0430\u043a\u0442\u0438\u0432\u043d\u044b\u0439_\u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442_\u043a\u0430\u043d\u0430\u043b");  

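Then the result in str is good, but if I read the text from a file the string stays exactly as it is in the file. I guess it is because '\u' becomes '\\u', i.e. the escape sequence is only interpreted by the compiler inside a string literal, not at run time. Is there a simple way to convert \uXXXX notation to the corresponding symbols in C++?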


Solution:1

Here is an example for MSalters's suggestion:

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <locale>
#include <regex>
#include <string>

// Appends the bytes in [first, last) to `out`, one wchar_t per byte.
// Every byte goes through unsigned char first so that values >= 0x80 are not
// sign-extended into bogus code units on platforms where plain char is signed.
static void append_widened(std::wstring& out,
                           std::string::const_iterator first,
                           std::string::const_iterator last) {
  for (; first != last; ++first) {
    out.push_back(static_cast<wchar_t>(static_cast<unsigned char>(*first)));
  }
}

// Replaces every literal six-character sequence "\uXXXX" (backslash, 'u', four
// hex digits) in `source` with the single wide character whose value is XXXX.
// All other bytes are copied through unchanged, one wchar_t per byte.
//
// NB: no support for non-BMP characters. Each escape becomes exactly one
// wchar_t, so a surrogate pair written as two escapes stays two code units.
//
// std::regex offers the same interface as Boost.Regex, so the snippet needs
// nothing outside the standard library.
std::wstring convert_unicode_escape_sequences(const std::string& source) {
  const std::regex escape("\\\\u([0-9A-Fa-f]{4})");

  std::wstring result;
  result.reserve(source.size());  // the output is never longer than the input

  // Start of the text that has not been copied to `result` yet.
  std::string::const_iterator pending = source.begin();

  const std::sregex_iterator no_more_matches;
  for (std::sregex_iterator it(source.begin(), source.end(), escape);
       it != no_more_matches; ++it) {
    const std::smatch& match = *it;

    // Plain text between the previous escape and this one.
    append_widened(result, pending, match[0].first);

    // The regex guarantees exactly four hex digits, so the value fits 16 bits.
    const unsigned long value =
        std::strtoul(match[1].str().c_str(), nullptr, 16);
    result.push_back(static_cast<wchar_t>(value));

    pending = match[0].second;
  }

  // Whatever follows the last escape (or the whole input if there was none).
  append_widened(result, pending, source.end());
  return result;
}

// Reads test.txt from the current directory, decodes its \uXXXX escapes and
// prints the result to std::wcout. Returns 1 if the file cannot be opened.
int wmain() {
  // Use the user's locale so std::wcout can emit non-ASCII characters.
  std::locale::global(std::locale(""));

  // A narrow path is used because std::ifstream's wchar_t* constructor is a
  // Microsoft-only extension; the name is plain ASCII either way.
  const std::string filename = "test.txt";
  std::ifstream stream(filename.c_str(), std::ios::in | std::ios::binary);
  if (!stream) {
    // Without this check a failed open would yield a size of -1 further down.
    std::wcerr << L"Cannot open test.txt" << std::endl;
    return 1;
  }

  // Read the whole file without a manually sized buffer.
  const std::string source((std::istreambuf_iterator<char>(stream)),
                           std::istreambuf_iterator<char>());

  const std::wstring result = convert_unicode_escape_sequences(source);
  std::wcout << result << std::endl;
  return 0;
}

I'm always surprised how complicated seemingly simple things like this are in C++.


Solution:2

It's not very easy to do while you're reading in the file. It's easier to do a post-processing step afterwards. You can use Boost::regex to look for the pattern "\\u[0-9A-Fa-f]{4}" (the backslash has to be escaped so that it matches a literal backslash), and replace each match by the corresponding single character.


Solution:3

My solution, which returns the result as a UTF-8 string. The UTF-16 to UTF-8 conversion itself is done with the standard library.

#include <cstddef>
#include <cstdint>
#include <fstream>
#include <string>

//------------------------------------------------------------------------------

// Returns the value (0..15) of one hexadecimal digit, or -1 when `c` is not a
// hexadecimal digit. Explicit range checks are used because a subtraction such
// as `c - '0' < 10` is also true for every character below '0'.
inline int hex_digit_value(uint8_t c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    if (c >= 'A' && c <= 'F')
        return c - 'A' + 0x0A;
    if (c >= 'a' && c <= 'f')
        return c - 'a' + 0x0A;
    return -1;
}

// Combines two hexadecimal digit characters into one byte: `h` supplies the
// high nibble and `l` the low nibble. A character that is not a hex digit
// contributes 0, so the result is always well defined.
inline uint8_t get_uint8(uint8_t h, uint8_t l)
{
    const int high = hex_digit_value(h);
    const int low = hex_digit_value(l);
    return static_cast<uint8_t>(((high < 0 ? 0 : high) << 4) | (low < 0 ? 0 : low));
}

// Tries to read a "\uXXXX" escape that starts at source[pos].
// On success stores the 16-bit value in `value` and returns true; returns
// false (leaving `value` untouched) if fewer than six characters remain, the
// prefix is not "\u", or any of the four digits is not hexadecimal.
inline bool read_unicode_escape(const std::string& source, std::size_t pos, std::uint32_t& value)
{
    if (pos > source.size() || source.size() - pos < 6)
        return false;
    if (source[pos] != '\\' || source[pos + 1] != 'u')
        return false;

    for (std::size_t k = 2; k < 6; ++k)
    {
        if (hex_digit_value(static_cast<uint8_t>(source[pos + k])) < 0)
            return false;
    }

    const std::uint32_t high_byte = get_uint8(static_cast<uint8_t>(source[pos + 2]),
                                              static_cast<uint8_t>(source[pos + 3]));
    const std::uint32_t low_byte = get_uint8(static_cast<uint8_t>(source[pos + 4]),
                                             static_cast<uint8_t>(source[pos + 5]));
    value = (high_byte << 8) | low_byte;
    return true;
}

// Appends the UTF-8 encoding (one to four bytes) of `code_point` to `out`.
inline void append_utf8(std::string& out, std::uint32_t code_point)
{
    if (code_point < 0x80)
    {
        out += static_cast<char>(code_point);
    }
    else if (code_point < 0x800)
    {
        out += static_cast<char>(0xC0 | (code_point >> 6));
        out += static_cast<char>(0x80 | (code_point & 0x3F));
    }
    else if (code_point < 0x10000)
    {
        out += static_cast<char>(0xE0 | (code_point >> 12));
        out += static_cast<char>(0x80 | ((code_point >> 6) & 0x3F));
        out += static_cast<char>(0x80 | (code_point & 0x3F));
    }
    else
    {
        out += static_cast<char>(0xF0 | (code_point >> 18));
        out += static_cast<char>(0x80 | ((code_point >> 12) & 0x3F));
        out += static_cast<char>(0x80 | ((code_point >> 6) & 0x3F));
        out += static_cast<char>(0x80 | (code_point & 0x3F));
    }
}

// Replaces every literal "\uXXXX" escape in `source` with the UTF-8 encoding of
// that code point and returns the resulting string.
//
// - Bytes that are not part of a valid escape are copied through unchanged, so
//   text that is already UTF-8 keeps its encoding.
// - A malformed escape (too short, or with non-hex digits) is left as written.
// - A high surrogate escape immediately followed by a low surrogate escape is
//   combined into one code point above U+FFFF.
// - A surrogate without its partner cannot be encoded in UTF-8 and is replaced
//   by U+FFFD (REPLACEMENT CHARACTER).
std::string convert_unicode_escape_sequences(const std::string& source)
{
    std::string result;
    result.reserve(source.size()); // an escape never expands: 6 bytes in, at most 3 out

    std::size_t pos = 0;
    while (pos < source.size())
    {
        std::uint32_t code_point = 0;
        if (!read_unicode_escape(source, pos, code_point))
        {
            result += source[pos];
            ++pos;
            continue;
        }
        pos += 6;

        if (code_point >= 0xD800 && code_point <= 0xDBFF)
        {
            // High surrogate: look for the matching low surrogate escape.
            std::uint32_t low = 0;
            if (read_unicode_escape(source, pos, low) && low >= 0xDC00 && low <= 0xDFFF)
            {
                code_point = 0x10000 + ((code_point - 0xD800) << 10) + (low - 0xDC00);
                pos += 6;
            }
        }

        if (code_point >= 0xD800 && code_point <= 0xDFFF)
            code_point = 0xFFFD; // unpaired surrogate

        append_utf8(result, code_point);
    }

    return result;
}


Solution:4

Check this code :) The Windows SDK already has this for you; the Microsoft folks thought of this too. You can find more details in this post: http://weblogs.asp.net/kennykerr/archive/2008/07/24/visual-c-in-short-converting-between-unicode-and-utf-8.aspx

#include <atlconv.h>
#include <atlstr.h>

// Short alias for ATL's assertion macro.
// NOTE(review): ATLASSERT is normally compiled out of release builds, so the
// checks below are only expected to fire in debug builds - confirm.
#define ASSERT ATLASSERT

// Round-trips a wide string through UTF-8 with ATL's conversion classes
// (CW2A: wide -> narrow, CA2W: narrow -> wide, both using code page CP_UTF8)
// and asserts that the text survives unchanged.
int main()
{
    // U+041F (Cyrillic capital letter Pe) and U+03A9 (Greek capital letter Omega).
    const CStringW unicode1 = L"\u041f and \x03A9";

    const CStringA utf8 = CW2A(unicode1, CP_UTF8);

    // Each of the two non-ASCII characters needs two bytes in UTF-8, so the
    // narrow string has more elements than the wide one.
    ASSERT(utf8.GetLength() > unicode1.GetLength());

    const CStringW unicode2 = CA2W(utf8, CP_UTF8);

    // Converting back must reproduce the original text exactly.
    ASSERT(unicode1 == unicode2);

    return 0;
}

This code has been tested by me and it works fine.


Note: If you also have a question or a solution, just comment below or mail us at toontricks1994@gmail.com
Previous
Next Post »