utf8與unicode轉換

阿新 • • 發佈：2018-12-21

/**
 * Decode a NUL-terminated UTF-8 string into an array of 16-bit code units.
 *
 * Only 1-, 2- and 3-byte sequences (U+0000..U+FFFF) are decoded. Any other
 * byte -- a stray continuation byte, a 4-byte lead byte, or a lead byte whose
 * continuation bytes are missing or malformed -- is skipped one byte at a
 * time, so truncated input can never make the decoder read past the
 * terminating NUL.
 *
 * Code units are stored as native uint16_t values (host byte order).
 *
 * @param in       NUL-terminated UTF-8 input.
 * @param out      If non-NULL, receives a heap-allocated array of decoded
 *                 code units followed by a single 0 terminator unit. The
 *                 caller owns it and must free() it. Set to NULL on failure.
 * @param outsize  If non-NULL, receives the size in BYTES of the decoded
 *                 data, not counting the terminator unit.
 * @return Number of code units decoded, or -1 if `in` is NULL or the
 *         allocation fails.
 */
int utf8_to_unicode(uint8_t *in, uint16_t **out, int *outsize)
{
    const uint8_t *p = in;
    uint16_t *result = NULL;
    int count = 0;

    if (out != NULL)
    {
        *out = NULL;
    }
    if (outsize != NULL)
    {
        *outsize = 0;
    }
    if (in == NULL)
    {
        return -1;
    }

    /* Each decoded unit consumes at least one input byte, so strlen + 1
     * units (the extra one is the terminator) is always large enough. */
    result = calloc(strlen((const char *)in) + 1, sizeof *result);
    if (result == NULL)
    {
        return -1;
    }

    while (*p != 0)
    {
        if (*p <= 0x7f)
        {
            /* 0xxxxxxx */
            result[count++] = *p;
            p += 1;
        }
        else if ((*p & 0xe0) == 0xc0 && (p[1] & 0xc0) == 0x80)
        {
            /* 110xxxxx 10xxxxxx */
            result[count++] = (uint16_t)(((*p & 0x1f) << 6) | (p[1] & 0x3f));
            p += 2;
        }
        else if ((*p & 0xf0) == 0xe0
                 && (p[1] & 0xc0) == 0x80   /* short-circuits on NUL, so   */
                 && (p[2] & 0xc0) == 0x80)  /* p[2] is only read if valid  */
        {
            /* 1110xxxx 10xxxxxx 10xxxxxx */
            result[count++] = (uint16_t)(((*p & 0x0f) << 12)
                                         | ((p[1] & 0x3f) << 6)
                                         | (p[2] & 0x3f));
            p += 3;
        }
        else
        {
            /* Unsupported or malformed byte: skip it and resynchronise. */
            p += 1;
        }
    }

    /* calloc already zeroed the terminator unit at result[count]. */
    if (outsize != NULL)
    {
        *outsize = (int)((size_t)count * sizeof *result);
    }
    if (out != NULL)
    {
        *out = result;
    }
    else
    {
        /* Nobody can take ownership; do not leak the buffer. */
        free(result);
    }

    return count;
}

/**
 * Encode an array of 16-bit code units as a NUL-terminated UTF-8 string.
 *
 * Each unit is encoded independently as a 1-, 2- or 3-byte sequence.
 * Surrogate pairs are NOT combined: a surrogate unit is emitted as its own
 * 3-byte sequence.
 *
 * @param in      Array of code units (native uint16_t values).
 * @param insize  Size of `in` in BYTES; a trailing odd byte is ignored.
 * @param out     Receives a heap-allocated, NUL-terminated UTF-8 buffer.
 *                The caller owns it and must free() it. Set to NULL on
 *                failure.
 * @return Number of UTF-8 bytes written (not counting the terminating NUL),
 *         or -1 if `out` is NULL, `insize` is negative, `in` is NULL while
 *         units are expected, or the allocation fails.
 */
int unicode_to_utf8(uint16_t *in, int insize, uint8_t **out)
{
    size_t charscount = 0;
    size_t i = 0;
    uint8_t *result = NULL;
    uint8_t *tmp = NULL;

    if (out == NULL)
    {
        return -1;
    }
    *out = NULL;

    if (insize < 0)
    {
        return -1;
    }
    charscount = (size_t)insize / sizeof(uint16_t);
    if (in == NULL && charscount > 0)
    {
        return -1;
    }

    /* Worst case is three output bytes per unit, plus the terminator. */
    result = calloc(charscount * 3 + 1, 1);
    if (result == NULL)
    {
        return -1;
    }
    tmp = result;

    for (i = 0; i < charscount; i++)
    {
        uint16_t unicode = in[i];

        if (unicode <= 0x007f)
        {
            /* 0xxxxxxx */
            *tmp++ = (uint8_t)unicode;
        }
        else if (unicode <= 0x07ff)
        {
            /* 110xxxxx 10xxxxxx */
            *tmp++ = (uint8_t)(0xc0 | (unicode >> 6));
            *tmp++ = (uint8_t)(0x80 | (unicode & 0x3f));
        }
        else
        {
            /* 1110xxxx 10xxxxxx 10xxxxxx
             * The middle byte must keep only six payload bits; a wider mask
             * lets bits 12-13 corrupt the 10xxxxxx continuation marker. */
            *tmp++ = (uint8_t)(0xe0 | (unicode >> 12));
            *tmp++ = (uint8_t)(0x80 | ((unicode >> 6) & 0x3f));
            *tmp++ = (uint8_t)(0x80 | (unicode & 0x3f));
        }
    }

    *tmp = '\0';
    *out = result;
    return (int)(tmp - result);
}

utf8與unicode轉換

int utf8_to_unicode(uint8_t *in, uint16_t **out, int *outsize) { uint8_t *p = in; uint16_t *result = NULL; int resultsize = 0; uint8

GBK與unicode轉換

最近寫unicode分詞時，需要處理GBK輸入，用到GBK與Unicode之間的轉換。轉自 http://www.latelee.org/programming-under-linux/gbk-to-unicode-table.html ----------------

python的string與Unicode轉換，gbk字串編碼

問題一：字串前面少了u。當遇見以下情況，返回字串為 '\u82f9\u679c' 的unicode時候： str1 = '\u82f9\u679c' # 這裡沒有u，當傳入引數

特殊符號與 unicode 轉換

以下內容全部是其他地方的 copy，以備用。中文符號轉換成 unicode /** * 中文轉換成 unicode * @author fanhui * 2007-3-15

JavaScript 字元與Unicode 轉換漢字轉Unicode碼

Unicode 是電腦科學領域裡的一項業界標準，JavaScript本身就是使用Unicode字符集編寫的，有時候我們需要對一段文字或者一段內容進行重新排版編譯的時候就需要將獲取的值進行轉碼，做個隨筆記錄一下，程式碼： <

Lazarus 1.8 Unicode 字元與UTF8字元的轉換

Lazarus 1.8 Unicode 字元與UTF8字元的轉換我有一個文字檔案，在WINDOWS 7 X64簡體中文版中儲存格式是UCS2,我想用LAZARUS 1.8 寫一工具進行格式化，發現用MEMO1匯入後顯示亂碼，經查，是LAZARUS 1.8 預設用UTF8處理

js字符串與Unicode編碼互相轉換

char 字符串 name targe ron strong target nbsp log ‘好‘.charCodeAt(0).toString(16) "597d" 這段代碼的意思是，把字符‘好‘轉化成Unicode編碼，看看charCodeAt()是怎麽個意思

（轉載）GBK、UTF8、UNICODE編碼轉換

url multi end lsm too vcs BE else bsp GBK、UTF8、UNICODE編碼轉換 1 string GBKToUTF8(const std::string& strGBK) 2 { 3 int nLen = Mu

Java中將字符串與unicode的相互轉換工具類

png style nic ava param 字符串拼接 size tostring info unicode編碼規則 unicode碼對每一個字符用4位16進制數表示。具體規則是：將一個字符(char)的高8位與低8位分別取出，轉化為16進制數，如果轉化的16進制數的長

JAVA方法字串與unicode的相互轉換

分享一下我老師大神的人工智慧教程！零基礎，通俗易懂！http://blog.csdn.net/jiangjunshow 也歡迎大家轉載本篇文章。分享知識，造福人民，實現我們中華民族偉大復興！

Unicode下的CString與char *轉換

轉載：http://blog.sina.com.cn/s/blog_63106cd80100yq8n.html 在VS2005及以上的環境中，所見工程的預設字符集形式是Unicode,而VC6.0中，字符集形式為多位元組字符集(MBCS: Multi-Byte Character Set),

C/C++中ASCII與Unicode字串相互轉換

轉載地址:https://blog.csdn.net/wbq2018/article/details/8806431 1、ASCII to Unicode 函式: wcstombs(VC6)、wcstombs_s 例項: //crt_wcstombs_s.c //This examp

【MFC】Unicode環境下CString與char*轉換

CString轉const char* const char* CStr2C(CString cstring) { return CW2A(cstring.GetString()); } char*轉CString CString C2CStr(char* cstr) { ret

MFC UTF-8與Unicode編碼轉換

環境適用範圍：MFC。字符集：多位元組字符集。宣告 CString UTF8AndUnicode_Convert(CString &strSource, UINT nSourceCodePage, UINT nTargetC

Unicode與中文轉換工具類方法（轉）

/* * 中文轉unicode編碼 */ public static String gbEncoding(final String gbString) { char[] utfBytes = gbString.toCharArray(); St

C++中漢字與unicode碼的轉換

已知一個漢字的unicode碼（例如－２０３１９），可以通過以下方法得到該unicode碼對應的漢字：１，宣告一個３位元組長的字元陣列，例如char a[3]；２，對該陣列賦值為char a[3]={unsigned(-2

Unicode字符集下CString與char *轉換

在Visual C++.NET2005中，預設的字符集形式是Unicode，但在VC6.0等工程中，預設的字符集形式是多位元組字符集（MBCS：Multi-Byte Character Set），這樣導致在VC6.0中非常簡單實用的各類字元操作和函式在VS2005環境下執行

CString與char*轉換（Unicode和多位元組字符集）

一、使用多位元組字符集 1.CString轉char* （1）傳給未分配記憶體的const char* （LPCTSTR）指標. CString cstr="ABC"; const char* ch

java unicode與字串轉換

package util; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * * <p>Title: String 與 Unicode 互相

中文與unicode的相互轉換例項

【1】中文轉unicode 程式碼例項如下： /** * * 將String轉換成unicode編碼格式 * @param str * @return String * @thro

utf8與unicode轉換

相關推薦