emsApplication/applications/ems_datahubs/iconv-utils.cpp

195 lines
3.5 KiB
C++
Raw Normal View History

2024-05-24 12:19:45 +08:00
#include <iconv.h>
#include <string.h>
#include <malloc.h>
#include "iconv-utils.h"
// Returns how many consecutive 1 bits `byte` has, counted from the most
// significant bit downwards (0 for 0x00..0x7F, 8 for 0xFF).
int preNUm(unsigned char byte)
{
    int leadingOnes = 0;
    // Walk a single-bit probe from bit 7 towards bit 0 and stop at the first 0.
    for (unsigned char probe = 0x80; probe != 0 && (byte & probe) != 0; probe >>= 1)
    {
        ++leadingOnes;
    }
    return leadingOnes;
}
// Heuristically checks whether the first `len` bytes of `data` are laid out
// like UTF-8 text.
//
// Accepted byte patterns:
//   0XXX_XXXX                                                (ASCII)
//   1110_XXXX 10XX_XXXX 10XX_XXXX
//   1111_0XXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
//   1111_10XX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
//   1111_110X 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
//
// Two-byte sequences (110X_XXXX 10XX_XXXX) are rejected, exactly as before.
// NOTE(review): presumably deliberate so that such byte pairs fall through to
// the GBK check in GetCoding() - confirm before relaxing.
//
// @param data  bytes to inspect
// @param len   number of valid bytes in data
// @return true if every byte belongs to one of the accepted patterns
//         (an empty buffer yields true), false otherwise
bool isUtf8(unsigned char* data, int len)
{
    int i = 0;
    while (i < len)
    {
        if ((data[i] & 0x80) == 0x00)
        {
            // Single-byte ASCII character.
            i++;
            continue;
        }

        // preNUm() returns the number of 1 bits before the first 0 bit of the
        // lead byte, which is also the total byte count of the sequence.
        const int num = preNUm(data[i]);
        if (num <= 2 || num > 6)
        {
            // 1: stray continuation byte, 2: see note above,
            // 7/8: 0xFE and 0xFF never start a UTF-8 sequence.
            return false;
        }

        i++;
        for (int j = 0; j < num - 1; j++)
        {
            // Each of the following num - 1 bytes must exist inside the
            // buffer and carry the 10XX_XXXX continuation marker.
            if (i >= len || (data[i] & 0xc0) != 0x80)
            {
                return false;
            }
            i++;
        }
    }
    return true;
}
// Checks whether the first `len` bytes of `data` fall inside the GBK byte
// ranges: single bytes 0x00..0x7F, or a lead byte 0x81..0xFE followed by a
// trail byte 0x40..0xFE other than 0x7F.
//
// This is a range check only; it does not verify that each pair is an
// assigned GBK character.
//
// @param data  bytes to inspect
// @param len   number of valid bytes in data
// @return true if the whole buffer matches (an empty buffer yields true),
//         false on an out-of-range byte or a lead byte with no trail byte
bool isGBK(unsigned char* data, int len)
{
    int i = 0;
    while (i < len)
    {
        if (data[i] <= 0x7f)
        {
            // Values up to 127 are single-byte and ASCII compatible.
            i++;
            continue;
        }

        // Anything above 127 must be the first half of a two-byte character,
        // so the second half has to be inside the buffer as well.
        if (i + 1 >= len)
        {
            return false;
        }

        const unsigned char lead = data[i];
        const unsigned char trail = data[i + 1];
        const bool leadOk = lead >= 0x81 && lead <= 0xfe;
        // 0x7F is the one value inside 0x40..0xFE that GBK does not use as a
        // trail byte.
        const bool trailOk = trail >= 0x40 && trail <= 0xfe && trail != 0x7f;
        if (!leadOk || !trailOk)
        {
            return false;
        }
        i += 2;
    }
    return true;
}
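// Note: isGBK() only checks that each byte pair falls inside the GBK code range,
// and every byte of a multi-byte UTF-8 sequence also falls inside that range.
// isGBK() is therefore only meaningful once isUtf8() has ruled out UTF-8.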
// Classifies the first `len` bytes of `data` as UTF8, GBK or UNKOWN.
CODING GetCoding(unsigned char* data, int len)
{
    // UTF-8 is tested first; GBK is only considered when that test fails.
    if (isUtf8(data, len))
    {
        return UTF8;
    }
    if (isGBK(data, len))
    {
        return GBK;
    }
    return UNKOWN;
}
// Converts `inlen` bytes of `inbuf` from `from_charset` to `to_charset` with
// iconv and stores the result in `outbuf`.
//
// @param from_charset  iconv name of the source encoding
// @param to_charset    iconv name of the destination encoding
// @param inbuf         input bytes
// @param inlen         number of input bytes to convert
// @param outbuf        destination buffer; zero-filled before converting
// @param outlen        capacity of outbuf in bytes
// @return 0 on success, -1 if the conversion descriptor cannot be opened or
//         iconv() reports an error
//
// The result is NUL-terminated only when at least one byte of outbuf is left
// unused, so callers that read it as a C string need one spare byte.
int code_convert(const char* from_charset, const char* to_charset, char* inbuf, size_t inlen, char* outbuf, size_t outlen)
{
    // iconv_open() signals failure with (iconv_t)-1, not with a null value.
    iconv_t cd = iconv_open(to_charset, from_charset);
    if (cd == (iconv_t)-1)
        return -1;

    memset(outbuf, 0, outlen);

    // iconv() advances both buffer pointers and decrements both byte counts,
    // so afterwards outbuf points just past the converted text and outlen is
    // the space still available.
    const size_t converted = iconv(cd, &inbuf, &inlen, &outbuf, &outlen);
    iconv_close(cd);
    if (converted == (size_t)-1)
        return -1;

    // Terminate the output explicitly when there is room for it.
    if (outlen > 0)
        *outbuf = '\0';
    return 0;
}
// Converts `inlen` bytes of UTF-8 text in `inbuf` to GB2312 in `outbuf`
// (capacity `outlen`) and returns the result of code_convert().
int u2g(char* inbuf, size_t inlen, char* outbuf, size_t outlen)
{
    const char* const sourceCharset = "utf-8";
    const char* const targetCharset = "gb2312";
    const int status = code_convert(sourceCharset, targetCharset, inbuf, inlen, outbuf, outlen);
    return status;
}
// Converts `inlen` bytes of GB2312 text in `inbuf` to UTF-8 in `outbuf`
// (capacity `outlen`) and returns the result of code_convert().
int g2u(char* inbuf, size_t inlen, char* outbuf, size_t outlen)
{
    const char* const sourceCharset = "gb2312";
    const char* const targetCharset = "utf-8";
    const int status = code_convert(sourceCharset, targetCharset, inbuf, inlen, outbuf, outlen);
    return status;
}
// Converts `strGBK` to UTF-8 through g2u().
//
// @param strGBK      input text
// @param str_result  on success the converted text is APPENDED to it;
//                    on failure it is reset to the empty string
// @return true if g2u() succeeded, false otherwise
bool GBKToUTF8(const std::string& strGBK,std::string& str_result)
{
    // Scratch buffer of twice the input size plus one spare byte for the
    // terminating NUL. Owned by std::string, so it is released on every path
    // and an allocation failure throws instead of yielding a null pointer.
    std::string buffer(strGBK.size() * 2 + 1, '\0');

    // g2u() takes non-const pointers, hence the const_cast on the input.
    if (g2u(const_cast<char*>(strGBK.c_str()), strGBK.size(), &buffer[0], buffer.size()) < 0)
    {
        str_result = "";
        return false;
    }

    // The converted text is read as a C string, i.e. up to the first NUL.
    str_result.append(buffer.c_str());
    return true;
}
// Converts the NUL-terminated UTF-8 string `utf8` through u2g().
//
// @param utf8        input text; a null pointer is treated as a failure
// @param str_result  on success the converted text is APPENDED to it;
//                    on failure it is reset to the empty string
// @return true if u2g() succeeded, false otherwise
bool UTFtoGBK(const char* utf8, std::string& str_result)
{
    if (!utf8)
    {
        str_result = "";
        return false;
    }

    const size_t length = strlen(utf8);

    // One byte more than the input so that a terminating NUL still fits when
    // the output is as long as the input (e.g. ASCII-only text) or the input
    // is empty. Owned by std::string, so it is released on every path.
    std::string buffer(length + 1, '\0');

    // u2g() takes non-const pointers, hence the const_cast on the input.
    if (u2g(const_cast<char*>(utf8), length, &buffer[0], buffer.size()) < 0)
    {
        str_result = "";
        return false;
    }

    // The converted text is read as a C string, i.e. up to the first NUL.
    str_result.append(buffer.c_str());
    return true;
}