2024-05-24 12:19:45 +08:00
|
|
|
|
#include <iconv.h>
|
|
|
|
|
#include <string.h>
|
|
|
|
|
#include <malloc.h>
|
|
|
|
|
|
|
|
|
|
#include "iconv-utils.h"
|
|
|
|
|
|
|
|
|
|
// Counts the consecutive 1 bits at the most-significant end of `byte`
// (for example 0xE4 = 1110_0100 yields 3).
// Returns 0 when the top bit is clear and 8 for 0xFF.
int preNUm(unsigned char byte)
{
    int leadingOnes = 0;
    unsigned char probe = 0x80;

    // Walk from the top bit downwards until the first 0 bit (or all 8 bits seen).
    while (leadingOnes < 8 && (byte & probe) != 0)
    {
        probe = probe >> 1;
        ++leadingOnes;
    }
    return leadingOnes;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Heuristically checks whether the bytes data[0..len) form UTF-8 text.
//
// Recognised layouts (lead byte followed by continuation bytes):
//   0XXX_XXXX
//   1110_XXXX 10XX_XXXX 10XX_XXXX
//   1111_0XXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
//   1111_10XX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
//   1111_110X 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
//
// NOTE(review): two-byte sequences (110X_XXXX 10XX_XXXX) are rejected, exactly
// as before this change. Presumably this avoids confusing them with GBK
// double-byte characters; confirm before relaxing it.
//
// @param data  bytes to inspect; a null pointer is accepted only when len <= 0
// @param len   number of bytes available in data
// @return true when every character matches one of the layouts above (an empty
//         or pure-ASCII buffer therefore returns true), false otherwise.
bool isUtf8(unsigned char* data, int len)
{
    if (data == nullptr)
    {
        return len <= 0;
    }

    int i = 0;
    while (i < len)
    {
        const unsigned char lead = data[i];
        if ((lead & 0x80) == 0x00)
        {
            // 0XXX_XXXX: single-byte (ASCII) character.
            i++;
            continue;
        }

        // The number of 1 bits before the first 0 bit of the lead byte is the
        // number of bytes used by the whole character.
        int num = 0;
        for (unsigned char mask = 0x80; mask != 0 && (lead & mask) != 0; mask >>= 1)
        {
            num++;
        }

        // num == 1 is a stray continuation byte, num == 2 is deliberately not
        // accepted (see note above), and num > 6 means 0xFE/0xFF, which never
        // occur in UTF-8.
        if (num < 3 || num > 6)
        {
            return false;
        }

        i++;
        for (int j = 0; j < num - 1; j++)
        {
            // A sequence cut off by the end of the buffer is not valid; this
            // check also keeps the read below inside data[0..len).
            if (i >= len)
            {
                return false;
            }
            // Every following byte must start with the bits 10.
            if ((data[i] & 0xc0) != 0x80)
            {
                return false;
            }
            i++;
        }
    }
    return true;
}
|
|
|
|
|
|
|
|
|
|
// Checks whether the bytes data[0..len) fall inside the GBK code ranges.
//
// Bytes <= 0x7F are single-byte characters (ASCII compatible). Any other byte
// must start a double-byte character whose lead byte is in 0x81..0xFE and
// whose trail byte is in 0x40..0xFE excluding 0x7F.
//
// @param data  bytes to inspect; a null pointer is accepted only when len <= 0
// @param len   number of bytes available in data
// @return true when the whole buffer is made of such characters (an empty
//         buffer returns true), false otherwise, including when a lead byte
//         is the last byte of the buffer.
bool isGBK(unsigned char* data, int len)
{
    if (data == nullptr)
    {
        return len <= 0;
    }

    int i = 0;
    while (i < len)
    {
        if (data[i] <= 0x7f)
        {
            // Code <= 127: one-byte encoding, compatible with ASCII.
            i++;
            continue;
        }

        // Codes above 127 use two bytes; the trail byte must lie inside the
        // buffer, otherwise the character is truncated.
        if (i + 1 >= len)
        {
            return false;
        }

        const unsigned char lead = data[i];
        const unsigned char trail = data[i + 1];
        const bool leadOk = lead >= 0x81 && lead <= 0xfe;
        // 0x7F is the one value inside 0x40..0xFE that GBK does not use as a
        // trail byte.
        const bool trailOk = trail >= 0x40 && trail <= 0xfe && trail != 0x7f;
        if (!(leadOk && trailOk))
        {
            return false;
        }
        i += 2;
    }
    return true;
}
|
|
|
|
|
|
|
|
|
|
//需要说明的是,isGBK()是通过双字节是否落在gbk的编码范围内实现的,
|
|
|
|
|
//而utf-8编码格式的每个字节都是落在gbk的编码范围内<E59BB4>?
|
|
|
|
|
//所以只有先调用isUtf8()先判断不是utf-8编码,再调用isGBK()才有意义
|
|
|
|
|
CODING GetCoding(unsigned char* data, int len)
|
|
|
|
|
{
|
|
|
|
|
CODING coding;
|
|
|
|
|
if (isUtf8(data, len) == true)
|
|
|
|
|
{
|
|
|
|
|
coding = UTF8;
|
|
|
|
|
}
|
|
|
|
|
else if (isGBK(data, len) == true)
|
|
|
|
|
{
|
|
|
|
|
coding = GBK;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
coding = UNKOWN;
|
|
|
|
|
}
|
|
|
|
|
return coding;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Converts inlen bytes of inbuf from from_charset to to_charset using iconv,
// writing the result into outbuf.
//
// The output buffer is zero-filled before conversion. After a successful
// conversion a terminating '\0' is written behind the converted bytes when at
// least one byte of space is left; callers that need a C string must therefore
// pass a buffer at least one byte larger than the converted text.
//
// @param from_charset  iconv name of the source encoding
// @param to_charset    iconv name of the target encoding
// @param inbuf         bytes to convert
// @param inlen         number of bytes in inbuf
// @param outbuf        destination buffer
// @param outlen        capacity of outbuf in bytes
// @return 0 on success; -1 when an argument is null, the conversion descriptor
//         cannot be opened, or iconv() reports an error.
int code_convert(const char* from_charset, const char* to_charset, char* inbuf, size_t inlen, char* outbuf, size_t outlen)
{
    if (from_charset == nullptr || to_charset == nullptr || outbuf == nullptr)
    {
        return -1;
    }
    if (inbuf == nullptr && inlen > 0)
    {
        return -1;
    }

    // iconv_open() signals failure with (iconv_t)-1, not with a null handle.
    iconv_t cd = iconv_open(to_charset, from_charset);
    if (cd == (iconv_t)-1)
    {
        return -1;
    }

    memset(outbuf, 0, outlen);

    // iconv() advances inbuf/outbuf and decrements inlen/outlen as it goes, so
    // afterwards outbuf points just past the converted bytes and outlen holds
    // the space that is left.
    const size_t rc = iconv(cd, &inbuf, &inlen, &outbuf, &outlen);
    iconv_close(cd);
    if (rc == (size_t)-1)
    {
        return -1;
    }

    // Terminate the output explicitly, but never write outside the buffer.
    if (outlen > 0)
    {
        *outbuf = '\0';
    }
    return 0;
}
|
|
|
|
|
|
|
|
|
|
int u2g(char* inbuf, size_t inlen, char* outbuf, size_t outlen)
|
|
|
|
|
{
|
|
|
|
|
return code_convert("utf-8", "gb2312", inbuf, inlen, outbuf, outlen);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int g2u(char* inbuf, size_t inlen, char* outbuf, size_t outlen)
|
|
|
|
|
{
|
|
|
|
|
return code_convert("gb2312", "utf-8", inbuf, inlen, outbuf, outlen);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool GBKToUTF8(const std::string& strGBK,std::string& str_result)
|
|
|
|
|
{
|
2024-11-15 18:22:43 +08:00
|
|
|
|
size_t length = strGBK.size() * 2 + 1;
|
2024-05-24 12:19:45 +08:00
|
|
|
|
|
|
|
|
|
char* temp = (char*)malloc(sizeof(char) * length);
|
|
|
|
|
|
|
|
|
|
if (g2u((char*)strGBK.c_str(), strGBK.size(), temp, length) >= 0)
|
|
|
|
|
{
|
|
|
|
|
str_result.append(temp);
|
|
|
|
|
free(temp);
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
free(temp);
|
|
|
|
|
str_result = "";
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
bool UTFtoGBK(const char* utf8, std::string& str_result)
|
|
|
|
|
{
|
2024-11-15 18:22:43 +08:00
|
|
|
|
size_t length = strlen(utf8);
|
2024-05-24 12:19:45 +08:00
|
|
|
|
|
|
|
|
|
char* temp = (char*)malloc(sizeof(char) * length);
|
|
|
|
|
|
|
|
|
|
if (u2g((char*)utf8, length, temp, length) >= 0)
|
|
|
|
|
{
|
|
|
|
|
str_result.append(temp);
|
|
|
|
|
free(temp);
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
free(temp);
|
|
|
|
|
str_result = "";
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|