C++ UTF8和UTF16互轉代碼

阿新 • • 發佈：2017-07-06

define iter 般的 != ont for efault 互轉小端

簡介

1、這段代碼只考慮在小端序情況下的轉換（一般的機器都是的）。
2、這段代碼需要C++11的支持(只是用到了u16string)，如果不支持，可以添加下面代碼

typedef uint16_t char16_t;
typedef std::basic_string<char16_t> u16string;

utfconvert.h

#ifndef __UTFCONVERT_H__
#define __UTFCONVERT_H__
#include <string>


// Converts a UTF-16 string to UTF-8. The input must start with a byte order
// mark: the first code unit selects the little-endian or big-endian decoder.
// Returns an empty string when the input is empty or has no BOM.
std::string utf16_to_utf8(const std::u16string& u16str);

// Converts a UTF-16 LE encoded string to UTF-8.
std::string utf16le_to_utf8(const std::u16string& u16str);

// Converts a UTF-16 BE encoded string to UTF-8.
std::string utf16be_to_utf8(const std::u16string& u16str);

// Converts a UTF-8 string to UTF-16 LE.
//   addbom - when true, a BOM code unit is placed at the front of the result.
//   ok     - optional; when non-NULL it receives whether the conversion
//            completed without encountering an unexpected byte.
std::u16string utf8_to_utf16le(const std::string& u8str, bool addbom = false, bool* ok = NULL);

// Converts a UTF-8 string to UTF-16 BE (the LE result with every code unit
// byte-swapped). Parameters have the same meaning as for utf8_to_utf16le.
std::u16string utf8_to_utf16be(const std::string& u8str, bool addbom = false, bool* ok = NULL);

#endif //! __UTFCONVERT_H__

utfconvert.cpp

#include "utfconvert.h"

#include <stdint.h>
#ifdef __GNUC__
#include <endian.h>
#endif // __GNUC__

// Exchanges the high and low bytes of a 16-bit value (0x1234 -> 0x3412).
// Uses the compiler's byte-swap intrinsic when one is known to exist and a
// portable shift/or otherwise.
static inline uint16_t byteswap_ushort(uint16_t number)
{
    uint16_t swapped;
#if defined(_MSC_VER) && _MSC_VER > 1310
    swapped = _byteswap_ushort(number);
#elif defined(__GNUC__)
    swapped = __builtin_bswap16(number);
#else
    swapped = static_cast<uint16_t>((number << 8) | (number >> 8));
#endif
    return swapped;
}


////////////////////////////////////////
//     以下轉換都是在小端序下進行     //
////////////////////////////////////////

// Converts a BOM-prefixed UTF-16 string to UTF-8.
// The first code unit decides which decoder handles the whole string:
//   0xFEFF -> utf16le_to_utf8
//   0xFFFE -> utf16be_to_utf8
// Empty input, or input whose first unit is neither value, yields an empty
// string.
std::string utf16_to_utf8(const std::u16string& u16str)
{
    if (u16str.empty()) {
        return std::string();
    }

    const char16_t first_unit = u16str[0];
    if (first_unit == 0xFEFF) {
        // BOM reads in native order: little-endian payload.
        return utf16le_to_utf8(u16str);
    }
    if (first_unit == 0xFFFE) {
        // BOM reads byte-swapped: big-endian payload.
        return utf16be_to_utf8(u16str);
    }
    return std::string();
}


// Converts a UTF-16 LE string to UTF-8.
//
// The code units are interpreted as stored, i.e. this assumes a little-endian
// host (it is not suitable for big-endian machines). A leading BOM (0xFEFF)
// is skipped and never appears in the output.
//
// Ill-formed input does not abort the conversion: every unpaired surrogate
// (a high surrogate not followed by a low one, or a low surrogate on its own)
// is replaced by U+FFFD so that the result is always well-formed UTF-8.
//
// Returns the UTF-8 encoded string; empty input yields an empty string.
std::string utf16le_to_utf8(const std::u16string& u16str)
{
    if (u16str.empty()) { return std::string(); }

    const char16_t* p = u16str.data();
    std::u16string::size_type len = u16str.length();
    if (p[0] == 0xFEFF) {
        // Leading BOM: step over it.
        ++p;
        --len;
    }

    std::string u8str;
    // Worst case is 3 bytes per code unit (a surrogate pair turns 2 units
    // into 4 bytes, a replacement character turns 1 unit into 3 bytes).
    u8str.reserve(len * 3);

    for (std::u16string::size_type i = 0; i < len; ++i) {
        uint32_t codePoint = p[i];

        if (codePoint >= 0xD800 && codePoint <= 0xDBFF) {
            // High surrogate: only valid when a low surrogate follows inside
            // the string. Check before consuming so we never read past `len`.
            if (i + 1 < len && p[i + 1] >= 0xDC00 && p[i + 1] <= 0xDFFF) {
                const uint32_t lowSur = p[++i];
                // Each surrogate carries 10 payload bits; the pair encodes
                // (code point - 0x10000).
                codePoint = 0x10000 + (((codePoint - 0xD800) << 10) | (lowSur - 0xDC00));
            } else {
                codePoint = 0xFFFD;
            }
        } else if (codePoint >= 0xDC00 && codePoint <= 0xDFFF) {
            // Low surrogate without a preceding high surrogate.
            codePoint = 0xFFFD;
        }

        if (codePoint < 0x80) {
            // U+0000 - U+007F: 0xxxxxxx
            u8str.push_back(static_cast<char>(codePoint));
        } else if (codePoint < 0x800) {
            // U+0080 - U+07FF: 110xxxxx 10xxxxxx
            u8str.push_back(static_cast<char>((codePoint >> 6) | 0xC0));
            u8str.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
        } else if (codePoint < 0x10000) {
            // U+0800 - U+FFFF: 1110xxxx 10xxxxxx 10xxxxxx
            u8str.push_back(static_cast<char>((codePoint >> 12) | 0xE0));
            u8str.push_back(static_cast<char>(((codePoint >> 6) & 0x3F) | 0x80));
            u8str.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
        } else {
            // U+10000 - U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            u8str.push_back(static_cast<char>((codePoint >> 18) | 0xF0));
            u8str.push_back(static_cast<char>(((codePoint >> 12) & 0x3F) | 0x80));
            u8str.push_back(static_cast<char>(((codePoint >> 6) & 0x3F) | 0x80));
            u8str.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
        }
    }

    return u8str;
}


// Converts a UTF-16 BE string to UTF-8.
//
// This assumes a little-endian host: each stored code unit holds big-endian
// data and is byte-swapped before it is interpreted. For the same reason a
// leading big-endian BOM shows up as the raw value 0xFFFE (0xFEFF with its
// bytes exchanged); it is skipped and never appears in the output.
//
// Ill-formed input does not abort the conversion: every unpaired surrogate
// (a high surrogate not followed by a low one, or a low surrogate on its own)
// is replaced by U+FFFD so that the result is always well-formed UTF-8.
//
// Returns the UTF-8 encoded string; empty input yields an empty string.
std::string utf16be_to_utf8(const std::u16string& u16str)
{
    if (u16str.empty()) { return std::string(); }

    const char16_t* p = u16str.data();
    std::u16string::size_type len = u16str.length();
    if (p[0] == 0xFFFE) {
        // Leading BOM (seen byte-swapped): step over it.
        ++p;
        --len;
    }

    std::string u8str;
    // Worst case is 3 bytes per code unit (a surrogate pair turns 2 units
    // into 4 bytes, a replacement character turns 1 unit into 3 bytes).
    u8str.reserve(len * 3);

    for (std::u16string::size_type i = 0; i < len; ++i) {
        // Bring the big-endian unit into native (little-endian) order.
        uint32_t codePoint = byteswap_ushort(p[i]);

        if (codePoint >= 0xD800 && codePoint <= 0xDBFF) {
            // High surrogate: only valid when a low surrogate follows inside
            // the string. Check before consuming so we never read past `len`.
            const uint32_t next = (i + 1 < len) ? byteswap_ushort(p[i + 1]) : 0;
            if (next >= 0xDC00 && next <= 0xDFFF) {
                ++i;
                // Each surrogate carries 10 payload bits; the pair encodes
                // (code point - 0x10000).
                codePoint = 0x10000 + (((codePoint - 0xD800) << 10) | (next - 0xDC00));
            } else {
                codePoint = 0xFFFD;
            }
        } else if (codePoint >= 0xDC00 && codePoint <= 0xDFFF) {
            // Low surrogate without a preceding high surrogate.
            codePoint = 0xFFFD;
        }

        if (codePoint < 0x80) {
            // U+0000 - U+007F: 0xxxxxxx
            u8str.push_back(static_cast<char>(codePoint));
        } else if (codePoint < 0x800) {
            // U+0080 - U+07FF: 110xxxxx 10xxxxxx
            u8str.push_back(static_cast<char>((codePoint >> 6) | 0xC0));
            u8str.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
        } else if (codePoint < 0x10000) {
            // U+0800 - U+FFFF: 1110xxxx 10xxxxxx 10xxxxxx
            u8str.push_back(static_cast<char>((codePoint >> 12) | 0xE0));
            u8str.push_back(static_cast<char>(((codePoint >> 6) & 0x3F) | 0x80));
            u8str.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
        } else {
            // U+10000 - U+10FFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            u8str.push_back(static_cast<char>((codePoint >> 18) | 0xF0));
            u8str.push_back(static_cast<char>(((codePoint >> 12) & 0x3F) | 0x80));
            u8str.push_back(static_cast<char>(((codePoint >> 6) & 0x3F) | 0x80));
            u8str.push_back(static_cast<char>((codePoint & 0x3F) | 0x80));
        }
    }

    return u8str;
}






// Converts a UTF-8 string to UTF-16 LE (code units in native order on a
// little-endian host).
//
//   u8str  - UTF-8 input; a leading UTF-8 BOM (EF BB BF) is skipped.
//   addbom - when true, the result starts with the BOM code unit 0xFEFF
//            (bytes FF FE in memory on a little-endian host).
//   ok     - optional; when non-null it is set to true if the whole input was
//            well-formed UTF-8 and to false otherwise.
//
// Ill-formed input never stops the conversion and never reads outside the
// string: the offending byte is dropped, `ok` reports false, and decoding
// resumes at the next byte. A sequence is rejected when it is truncated, has
// a bad continuation byte, is an overlong form, encodes a surrogate
// (U+D800 - U+DFFF) or exceeds U+10FFFF.
//
// Returns the converted string (possibly just the BOM, or empty).
std::u16string utf8_to_utf16le(const std::string& u8str, bool addbom, bool* ok)
{
    std::u16string u16str;
    u16str.reserve(u8str.size());
    if (addbom) {
        u16str.push_back(0xFEFF);
    }

    const unsigned char* p = reinterpret_cast<const unsigned char*>(u8str.data());
    std::string::size_type len = u8str.length();
    // Skip a UTF-8 BOM, including the case where the input is nothing else.
    if (len >= 3 && p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF) {
        p += 3;
        len -= 3;
    }

    bool is_ok = true;
    std::string::size_type i = 0;
    while (i < len) {
        const uint32_t lead = p[i];
        if (lead < 0x80) {
            // Single byte: the code point is the byte itself.
            u16str.push_back(static_cast<char16_t>(lead));
            ++i;
            continue;
        }

        // Classify the lead byte: number of continuation bytes, payload bits
        // carried by the lead byte, and the smallest code point that is
        // allowed to use this length (anything below is an overlong form).
        std::string::size_type extra = 0;
        uint32_t codePoint = 0;
        uint32_t minValue = 0;
        if ((lead & 0xE0) == 0xC0) {            // 110xxxxx
            extra = 1; codePoint = lead & 0x1F; minValue = 0x80;
        } else if ((lead & 0xF0) == 0xE0) {     // 1110xxxx
            extra = 2; codePoint = lead & 0x0F; minValue = 0x800;
        } else if ((lead & 0xF8) == 0xF0) {     // 11110xxx
            extra = 3; codePoint = lead & 0x07; minValue = 0x10000;
        } else {
            // Stray continuation byte or a byte that can never start a sequence.
            is_ok = false;
            ++i;
            continue;
        }

        // All continuation bytes must exist and look like 10xxxxxx.
        bool valid = (len - i > extra);
        for (std::string::size_type k = 1; valid && k <= extra; ++k) {
            const uint32_t cont = p[i + k];
            if ((cont & 0xC0) != 0x80) {
                valid = false;
            } else {
                codePoint = (codePoint << 6) | (cont & 0x3F);
            }
        }
        if (valid && (codePoint < minValue || codePoint > 0x10FFFF ||
                      (codePoint >= 0xD800 && codePoint <= 0xDFFF))) {
            valid = false;
        }
        if (!valid) {
            // Drop only the lead byte so decoding can resynchronise.
            is_ok = false;
            ++i;
            continue;
        }
        i += extra + 1;

        if (codePoint >= 0x10000) {
            // U+10000 - U+10FFFF needs a surrogate pair: subtract 0x10000 to
            // get a 20-bit value, then put the high 10 bits on 0xD800 and the
            // low 10 bits on 0xDC00.
            codePoint -= 0x10000;
            u16str.push_back(static_cast<char16_t>((codePoint >> 10) | 0xD800U));
            u16str.push_back(static_cast<char16_t>((codePoint & 0x03FFU) | 0xDC00U));
        } else {
            // BMP code points map to a single identical code unit.
            u16str.push_back(static_cast<char16_t>(codePoint));
        }
    }

    if (ok) { *ok = is_ok; }
    return u16str;
}


// 獲取轉換為UTF-16 BE的字符串
std::u16string utf8_to_utf16be(conststd::string& u8str, bool addbom, bool* ok)
{
    // 先獲取utf16le編碼字符串
    std::u16string u16str = utf8_to_utf16le(u8str, addbom, ok);
    // 將小端序轉換為大端序
    for (size_t i = 0; i < u16str.size(); ++i) {
        u16str[i] = byteswap_ushort(u16str[i]);
    }
    return u16str;
}

http://www.cnblogs.com/oloroso/p/6801076.html

C++ UTF8和UTF16互轉代碼

define iter 般的 != ont for efault 互轉小端簡介 1、這段代碼只考慮在小端序情況下的轉換（一般的機器都是的）。2、這段代碼需要C++11的支持(只是用到了u16string)，如果不支持，可以添加下面代碼 typedef uin

C int和BYTE互轉、字串轉十六進位制位元組陣列

//int 轉 4位元組 BYTE[], void intToByte(int i,BYTE abyte[]) { abyte[3] = (byte)(0xff & i); abyte[2] = (byte)((0xff00 & i) >>

utf8和ucs2互轉

UFT8和UCS2互轉效API使用進行拆分字數如圖 /* * 函式： * utf8_to_ucs2（utf8轉ucs2） * 引數： * utf8：utf8資料 * utf8_lenght：utf8資料長度 * ucs2：儲存ucs2資料空間

C++構造函數和編譯器自動生成代碼的陷阱

log bug () 很好 style 自動 pub 為我 ret 最近在項目中debug各種access violation的，其中這個問題比較有代表性，並且能夠被規範的代碼標準解決。問題可以總結為以下的代碼： 1 class TestString 2 { 3

c#的托管代碼和非托管代碼的理解

ont 線程管理安全本機有一個自己垃圾相關 spa 理解托管和非托管代碼的前提之下，要先了解CLR(公共語言運行庫) .Net Framework 是由彼此獨立又相關的兩部分組成:CLR 和類庫, CLR是它為我們提供的服務，類庫是它實現的功能. .NET

【轉】托管代碼和非托管代碼的區別

產生沒有匯編代碼 cati 一行包含虛擬機被調用庫類什麽是托管代碼(managed code)？托管代碼是一microsoft的中間語言(IL)，他主要的作用是在.NET FRAMEWORK的公共語言運行庫(CLR)執行代碼前去編譯

C# 位元組陣列和字串互轉

本章講述：部分資料型別，格式轉換（十六進位制字串和位元組陣列互轉位元組陣列和字串互轉） public class HexConverter { #region 格式轉換 /// <summary> /// 轉換十六進位制

C#同步網絡時間和本地時間的代碼

lac linq generic ray [1] 常用 seq update etime 做工程過程，將做工程過程經常用的內容做個收藏，下面內容內容是關於 C#同步網絡時間和本地時間的內容，希望對各位朋友有較大用途。 using System; using System.

C#與unity中base64string和圖片互轉

C#: using System; using System.Collections.Generic; using System.Linq; using System.Text; using System.Threading.Tasks; using System.Drawing; using

用C#呼叫Windows API和其它程序通訊及C# 獲得另外一程式控制代碼後控制該程式控制其透明窗體大小

作者：邊城浪子◎2005-01-16 關鍵字： C#，API，FindWindow，FindWindowEx，SendMessage，程序，登錄檔設計初衷：公司為了便於網路管理，使用了IEEE 802.1X的網路訪問控制，這樣每次開機需要輸入兩次登入密碼，於是我就研究了

C中字串UTF-8和GB2312互轉

在程式設計過程中需要對字串進行不同的轉換，特別是Gb2312和Utf-8直接的轉換。在幾個開源的魔獸私服中，很多都是老外開發的，而暴雪為了能夠相容世界上的各個字符集也使用了UTF-8。在中國使用VS（VS2005以上版本）開發基本都是使用Gb2312的Unicode字符

轉：更新最新程式碼到本地、和推送原生代碼到遠端程式碼庫

一.更新最新程式碼到本地 1.檢視遠端分支使用如下命令可以檢視遠端倉庫（我這裡有一個origin倉庫） $ git remote -v git remote命令會列出每個遠端庫的簡短名字，在克隆完某個專案後，至少可以看到一個名為 origin 的遠端庫，gi

C#控制Windows系統關機、重啟和註銷的代碼

rgs res nsh gof oid win shu eve nta 如下資料是關於C#控制Windows系統關機、重啟和註銷的代碼。 protected void btnShutDown_Click(object sender, EventArgs e)

Objective-C 列舉型別和字串互轉方案

列舉型別的使用優勢沒啥好說的。但經常需要將它與字串進行轉換方便顯示或其它需求。在常見的方案中，多是先宣告，然後在實現檔案裡建立一個對映表。這樣帶來的問題就是要分兩個檔案來管理，帶來管理不方便不好。使用前還在確保對映表已經初始化。顯然使用時還是有些許難受。如果全部改用巨集

c# 無損高質量壓縮圖片代碼

++ osi name source 新路 public rip erp lan 最近，項目上涉及到了圖像壓縮，發現原有的圖像壓縮功能，雖然保證了圖像的大小300K以內，但是壓縮後的圖像看的不在清晰，並且，限定了圖片的Height或者是Width。在CSDN上看到了一個

wchar_t* 和char* 互轉

multi pan com nic return compose ide acp 轉化 //將單字節char*轉化為寬字節wchar_t* wchar_t* AnsiToUnicode(const char* szStr){ int nLen = MultiByte

用Html5/CSS3做Winform，一步一步教你搭建CefSharp開發環境（附JavaScript異步調用C#例子，及全部源代碼）上

轉載界面設計右鍵異步一個由於編寫 scrip 調用本文為雞毛巾原創，原文地址：http://www.cnblogs.com/jimaojin/p/7077131.html，轉載請註明 CefSharp說白了就是Chromium瀏覽器的嵌入式核心，我們用此開發W

html5中制作loading圖標和圖片預覽代碼詳解

eight -c html5 圖片 nec lin lib jpg truct zh-cn html5制作loading圖的示例代碼如下: <!DOCTYPE html><html><head><title><

C# 圖片與Base64互轉

name oba mar base64 richtext sender bin binary ram /// <summary> /// 將圖片數據轉換為Base64字符串 /// </summary> /// <pa

通過遊戲學python 3.6 第一季第三章實例項目猜數字遊戲--核心代碼--猜測次數--隨機函數和屏蔽錯誤代碼--優化代碼及註釋可復制直接使用娛樂可封裝函數

nbsp 退出而不是判斷 and 封裝 except 次數 img 1 #猜數字--核心代碼--猜測次數--隨機函數和屏蔽錯誤代碼---優化代碼及註釋 2 3 import random 4 number = random.randint(1,

C++ UTF8和UTF16互轉代碼

簡介

utfconvert.h

utfconvert.cpp

相關推薦