192 lines
3.4 KiB
C++
192 lines
3.4 KiB
C++
#include <iconv.h>
|
||
#include <string.h>
|
||
#include <malloc.h>
|
||
|
||
#include "iconv-utils.h"
|
||
|
||
// Returns the number of consecutive 1 bits at the most-significant end of
// `byte` (0 for 0x00..0x7F, up to 8 for 0xFF).
int preNUm(unsigned char byte)
{
    int count = 0;
    // Walk a single-bit probe from bit 7 downwards and stop at the first 0 bit,
    // or once the probe has been shifted out entirely.
    for (unsigned char probe = 0x80; probe != 0 && (byte & probe) != 0; probe >>= 1)
    {
        ++count;
    }
    return count;
}
|
||
|
||
|
||
bool isUtf8(unsigned char* data, int len)
|
||
{
|
||
int num = 0;
|
||
int i = 0;
|
||
while (i < len)
|
||
{
|
||
if ((data[i] & 0x80) == 0x00)
|
||
{
|
||
// 0XXX_XXXX
|
||
i++;
|
||
continue;
|
||
}
|
||
else if ((num = preNUm(data[i])) > 2)
|
||
{
|
||
// 110X_XXXX 10XX_XXXX
|
||
// 1110_XXXX 10XX_XXXX 10XX_XXXX
|
||
// 1111_0XXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
|
||
// 1111_10XX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
|
||
// 1111_110X 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
|
||
// preNUm() 返回首个字节8个bits中首<E4B8AD>?0bit前面1bit的个数,该数量也是该字符所使用的字节数
|
||
i++;
|
||
for (int j = 0; j < num - 1; j++)
|
||
{
|
||
//判断后面num - 1 个字节是不是都是10开
|
||
if ((data[i] & 0xc0) != 0x80)
|
||
{
|
||
return false;
|
||
}
|
||
i++;
|
||
}
|
||
}
|
||
else
|
||
{
|
||
//其他情况说明不是utf-8
|
||
return false;
|
||
}
|
||
}
|
||
return true;
|
||
}
|
||
|
||
// Checks whether the first `len` bytes of `data` fall inside the GBK code
// ranges.
//
// Bytes <= 0x7F are single-byte (ASCII-compatible) characters. Any other byte
// must start a double-byte character whose lead byte is in 0x81..0xFE and
// whose trail byte is in 0x40..0xFE excluding 0x7F.
//
// Returns true when the whole buffer matches (an empty buffer yields true),
// false on the first byte pair outside those ranges or when a lead byte is the
// last byte of the buffer.
bool isGBK(unsigned char* data, int len)
{
    int i = 0;
    while (i < len)
    {
        if (data[i] <= 0x7f)
        {
            // Single-byte character, ASCII compatible.
            i++;
            continue;
        }

        // Values above 0x7F use a double-byte encoding, so a trail byte must
        // exist inside the buffer before it is inspected.
        if (i + 1 >= len)
        {
            return false;
        }

        const unsigned char lead = data[i];
        const unsigned char trail = data[i + 1];
        const bool leadOk = lead >= 0x81 && lead <= 0xfe;
        // 0x7F is the one value inside 0x40..0xFE that is not a valid trail byte.
        const bool trailOk = trail >= 0x40 && trail <= 0xfe && trail != 0x7f;
        if (!leadOk || !trailOk)
        {
            return false;
        }
        i += 2;
    }
    return true;
}
|
||
|
||
//需要说明的是,isGBK()是通过双字节是否落在gbk的编码范围内实现的,
|
||
//而utf-8编码格式的每个字节都是落在gbk的编码范围内<E59BB4>?
|
||
//所以只有先调用isUtf8()先判断不是utf-8编码,再调用isGBK()才有意义
|
||
CODING GetCoding(unsigned char* data, int len)
|
||
{
|
||
CODING coding;
|
||
if (isUtf8(data, len) == true)
|
||
{
|
||
coding = UTF8;
|
||
}
|
||
else if (isGBK(data, len) == true)
|
||
{
|
||
coding = GBK;
|
||
}
|
||
else
|
||
{
|
||
coding = UNKOWN;
|
||
}
|
||
return coding;
|
||
}
|
||
|
||
// Converts `inlen` bytes at `inbuf` from `from_charset` to `to_charset` using
// iconv and stores the result in `outbuf` as a NUL-terminated string.
//
// Parameters:
//   from_charset / to_charset  iconv charset names.
//   inbuf, inlen               input bytes and their count.
//   outbuf, outlen             output buffer and its total size in bytes; one
//                              byte is reserved for the terminating NUL, so at
//                              most outlen - 1 converted bytes are stored.
//
// Returns 0 on success and -1 on failure (no usable output buffer, the
// conversion is not available, the input is invalid or incomplete, or the
// output does not fit). The whole output buffer is zeroed before converting.
int code_convert(const char* from_charset, const char* to_charset, char* inbuf, size_t inlen, char* outbuf, size_t outlen)
{
    // Without at least one byte there is no room even for the terminator.
    if (outbuf == NULL || outlen == 0)
        return -1;

    // iconv_open() reports failure as (iconv_t)-1, not as 0.
    iconv_t cd = iconv_open(to_charset, from_charset);
    if (cd == (iconv_t)-1)
        return -1;

    memset(outbuf, 0, outlen);

    // iconv() advances these cursors and decrements the remaining counts.
    char* in = inbuf;
    char* out = outbuf;
    size_t inleft = inlen;
    size_t outleft = outlen - 1; // keep the last byte for the terminator

    // iconv() returns size_t; the error value is (size_t)-1.
    const size_t rc = iconv(cd, &in, &inleft, &out, &outleft);
    iconv_close(cd);
    if (rc == static_cast<size_t>(-1))
        return -1;

    // `out` points just past the last converted byte and is still inside the
    // buffer thanks to the reserved byte.
    *out = '\0';

    return 0;
}
|
||
|
||
int u2g(char* inbuf, size_t inlen, char* outbuf, size_t outlen)
|
||
{
|
||
return code_convert("utf-8", "gb2312", inbuf, inlen, outbuf, outlen);
|
||
}
|
||
|
||
int g2u(char* inbuf, size_t inlen, char* outbuf, size_t outlen)
|
||
{
|
||
return code_convert("gb2312", "utf-8", inbuf, inlen, outbuf, outlen);
|
||
}
|
||
|
||
bool GBKToUTF8(const std::string& strGBK,std::string& str_result)
|
||
{
|
||
int length = strGBK.size() * 2 + 1;
|
||
|
||
char* temp = (char*)malloc(sizeof(char) * length);
|
||
|
||
if (g2u((char*)strGBK.c_str(), strGBK.size(), temp, length) >= 0)
|
||
{
|
||
str_result.append(temp);
|
||
free(temp);
|
||
return true;
|
||
}
|
||
else
|
||
{
|
||
free(temp);
|
||
str_result = "";
|
||
return false;
|
||
}
|
||
}
|
||
|
||
bool UTFtoGBK(const char* utf8, std::string& str_result)
|
||
{
|
||
int length = strlen(utf8);
|
||
|
||
char* temp = (char*)malloc(sizeof(char) * length);
|
||
|
||
if (u2g((char*)utf8, length, temp, length) >= 0)
|
||
{
|
||
str_result.append(temp);
|
||
free(temp);
|
||
|
||
return true;
|
||
}
|
||
else
|
||
{
|
||
free(temp);
|
||
str_result = "";
|
||
return false;
|
||
}
|
||
} |