還是簡單些好: 穿梭於 UTF-8 與 UTF-16 之間

偶然於 7zip 的 LZMA sdk 裡發現非常簡潔的 UTF-8/UTF-16 變換函數，連一般轉換成 Unicode 的中介動作也省去了。可惜它本身的解壓功能未能滿足遊戲裝載系統的要求，皆因 7zip 的 archive 格式不能以最少的資源去解壓 archive 裡的個別檔案。
以下源始碼引用 LZMA sdk 再加上本人所寫的額外錯誤偵測與註解，enjoy!


typedef byte_t unsigned char;

// Reference: http://en.wikipedia.org/wiki/Utf8
static const byte_t cUtf8Limits[] = {
 0xC0, // Start of a 2-byte sequence
 0xE0, // Start of a 3-byte sequence
 0xF0, // Start of a 4-byte sequence
 0xF8, // Start of a 5-byte sequence
 0xFC, // Start of a 6-byte sequence
 0xFE // Invalid: not defined by original UTF-8 specification
};

/*! Usually it is a 2 steps process to convert the string, invoke utf8ToUtf16() with
 dest equals to null so that it gives you destLen (not including null terminator),
 then allocate the destination with that amount of memory and call utf8ToUtf16() once
 again to perform the actual conversion. You can skip the first call if you sure
 the destination buffer is large enough to store the data.

 \note Here we assum sizeof(wchar_t) == 2
 \ref Modify from 7zip LZMA sdk
 */
bool utf8ToUtf16(wchar_t* dest, size_t& destLen, const char* src, size_t maxSrcLen)
{
 size_t destPos = 0, srcPos = 0;

 while(true)
 {
  byte_t c; // Note that byte_t should be unsigned
  size_t numAdds;

  if(srcPos == maxSrcLen || src[srcPos] == '\0') {
   if(dest && destLen != destPos) {
    assert(false && "The provided destLen should equals to what we calculated here");
    return false;
   }

   destLen = destPos;
   return true;
  }

  c = src[srcPos++];

  if(c < 0x80) { // 0-127, US-ASCII (single byte)
   if(dest)
    dest[destPos] = (wchar_t)c;
   ++destPos;
   continue;
  }

  if(c < 0xC0) // The first octet for each code point should within 0-191
   break;

  for(numAdds = 1; numAdds < 5; ++numAdds)
   if(c < cUtf8Limits[numAdds])
    break;
  uint32_t value = c - cUtf8Limits[numAdds - 1];

  do {
   byte_t c2;
   if(srcPos == maxSrcLen || src[srcPos] == '\0')
    break;
   c2 = src[srcPos++];
   if(c2 < 0x80 || c2 >= 0xC0)
    break;
   value <<= 6;
   value |= (c2 - 0x80);
  } while(--numAdds != 0);

  if(value < 0x10000) {
   if(dest)
    dest[destPos] = (wchar_t)value;
   ++destPos;
  }
  else {
   value -= 0x10000;
   if(value >= 0x100000)
    break;
   if(dest) {
    dest[destPos + 0] = (wchar_t)(0xD800 + (value >> 10));
    dest[destPos + 1] = (wchar_t)(0xDC00 + (value & 0x3FF));
   }
   destPos += 2;
  }
 }

 destLen = destPos;
 return false;
}

bool utf8ToWStr(const char* utf8Str, size_t maxCount, std::wstring& wideStr)
{
 size_t destLen = 0;

 // Get the length of the wide string
 if(!utf8ToUtf16(nullptr, destLen, utf8Str, maxCount))
  return false;

 wideStr.resize(destLen);
 if(wideStr.size() != destLen)
  return false;

 return utf8ToUtf16(const_cast<wchar_t*>(wideStr.c_str()), destLen, utf8Str, maxCount);
}

bool utf8ToWStr(const std::string& utf8Str, std::wstring& wideStr)
{
 return utf8ToWStr(utf8Str.c_str(), utf8Str.size(), wideStr);
}

//! See the documentation for utf8ToUtf16()
bool utf16ToUtf8(char* dest, size_t& destLen, const wchar_t* src, size_t maxSrcLen)
{
 size_t destPos = 0, srcPos = 0;

 while(true)
 {
  uint32_t value;
  size_t numAdds;

  if(srcPos == maxSrcLen || src[srcPos] == L'\0') {
   if(dest && destLen != destPos) {
    assert(false && "The provided destLen should equals to what we calculated here");
    return false;
   }
   destLen = destPos;
   return true;
  }

  value = src[srcPos++];

  if(value < 0x80) { // 0-127, US-ASCII (single byte)
   if(dest)
    dest[destPos] = char(value);
   ++destPos;
   continue;
  }

  if(value >= 0xD800 && value < 0xE000) {
   if(value >= 0xDC00 || srcPos == maxSrcLen)
    break;
   uint32_t c2 = src[srcPos++];
   if(c2 < 0xDC00 || c2 >= 0xE000)
    break;
   value = ((value - 0xD800) << 10) | (c2 - 0xDC00);
  }

  for(numAdds = 1; numAdds < 5; ++numAdds)
   if(value < (uint32_t(1) << (numAdds * 5 + 6)))
    break;

  if(dest)
   dest[destPos] = char(cUtf8Limits[numAdds - 1] + (value >> (6 * numAdds)));
  ++destPos;

  do {
   --numAdds;
   if(dest)
    dest[destPos] = char(0x80 + ((value >> (6 * numAdds)) & 0x3F));
   ++destPos;
  } while(numAdds != 0);
 }

 destLen = destPos;
 return false;
}

bool wStrToUtf8(const wchar_t* wideStr, size_t maxCount, std::string& utf8Str)
{
 size_t destLen = 0;

 // Get the length of the utf-8 string
 if(!utf16ToUtf8(nullptr, destLen, wideStr, maxCount))
  return false;

 utf8Str.resize(destLen);
 if(utf8Str.size() != destLen)
  return false;

 return utf16ToUtf8(const_cast<char*>(utf8Str.c_str()), destLen, wideStr, maxCount);
}

bool wStrToUtf8(const std::wstring& wideStr, std::string& utf8Str)
{
 return wStrToUtf8(wideStr.c_str(), wideStr.size(), utf8Str);
}

還是簡單些好

2008年11月26日星期三

穿梭於 UTF-8 與 UTF-16 之間

沒有留言:

發佈留言

Search this blog

連結

網誌封存

關於我

還是簡單些好

2008年11月26日星期三

穿梭於 UTF-8 與 UTF-16 之間

沒有留言:

發佈留言

Search this blog

連結

訂閱

網誌封存

關於我