emsApplication/applications/ems_datahubs/iconv-utils.cpp

195 lines
3.5 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#include <iconv.h>
#include <string.h>
#include <malloc.h>
#include "iconv-utils.h"
/**
 * Counts the consecutive 1 bits at the top of a byte, starting from the most
 * significant bit and stopping at the first 0 bit.
 *
 * @param byte the byte to inspect
 * @return number of leading 1 bits, in the range 0..8
 */
int preNUm(unsigned char byte)
{
    int leading_ones = 0;
    // Walk a single-bit probe from bit 7 down to bit 0; stop at the first clear bit.
    for (unsigned int probe = 0x80; probe != 0 && (byte & probe) != 0; probe >>= 1)
    {
        ++leading_ones;
    }
    return leading_ones;
}
/**
 * Heuristically checks whether a byte buffer looks like UTF-8 encoded text.
 *
 * Bytes of the form 0XXX_XXXX are accepted as single-byte characters. A
 * multi-byte character is accepted only when its lead byte announces a length
 * of 3 to 6 bytes and each of the following bytes has the continuation form
 * 10XX_XXXX:
 *
 *   1110_XXXX 10XX_XXXX 10XX_XXXX
 *   1111_0XXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
 *   1111_10XX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
 *   1111_110X 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
 *
 * Two-byte sequences (110X_XXXX 10XX_XXXX) are rejected, as they always were
 * in this function.
 * NOTE(review): presumably deliberate, because such pairs also fall inside the
 * GBK double-byte range — confirm before widening the check.
 *
 * @param data buffer to inspect
 * @param len  number of valid bytes in data
 * @return true when every character in the buffer matches the rules above
 *         (an empty buffer yields true), false otherwise
 */
bool isUtf8(unsigned char* data, int len)
{
    int i = 0;
    while (i < len)
    {
        if ((data[i] & 0x80) == 0x00)
        {
            // 0XXX_XXXX: single-byte character.
            ++i;
            continue;
        }

        // preNUm() returns the number of 1 bits that precede the first 0 bit of
        // the lead byte; that count is also the number of bytes the character uses.
        const int num = preNUm(data[i]);
        if (num <= 2 || num > 6)
        {
            // 1 leading one is a stray continuation byte, 2 is the rejected
            // two-byte form, and 7 or 8 (0xFE / 0xFF) is not a lead byte at all.
            return false;
        }

        ++i;
        for (int j = 0; j < num - 1; ++j)
        {
            // Each of the remaining num - 1 bytes must exist inside the buffer
            // and start with the bits 10. The bounds check prevents reading past
            // the end when the buffer stops in the middle of a character.
            if (i >= len || (data[i] & 0xc0) != 0x80)
            {
                return false;
            }
            ++i;
        }
    }
    return true;
}
/**
 * Checks whether every character in a byte buffer falls inside the GBK code
 * range.
 *
 * Bytes up to 0x7F are treated as single-byte, ASCII-compatible characters.
 * Any other byte must be the lead byte (0x81..0xFE) of a double-byte character
 * whose trail byte lies in 0x40..0xFE, excluding 0x7F.
 *
 * This is a pure range check, so text in other encodings can pass it too.
 *
 * @param data buffer to inspect
 * @param len  number of valid bytes in data
 * @return true when the whole buffer matches the rules above (an empty buffer
 *         yields true), false otherwise — including when the buffer ends with
 *         a lead byte that has no trail byte
 */
bool isGBK(unsigned char* data, int len)
{
    int i = 0;
    while (i < len)
    {
        if (data[i] <= 0x7f)
        {
            // Values up to 127 are single-byte and ASCII-compatible.
            ++i;
            continue;
        }

        // Values above 127 start a double-byte character; make sure the trail
        // byte is actually inside the buffer before looking at it.
        if (i + 1 >= len)
        {
            return false;
        }

        const unsigned char lead = data[i];
        const unsigned char trail = data[i + 1];
        const bool lead_ok = lead >= 0x81 && lead <= 0xfe;
        // 0x7F is the one value inside 0x40..0xFE that GBK never uses as a trail byte.
        const bool trail_ok = trail >= 0x40 && trail <= 0xfe && trail != 0x7f;
        if (!lead_ok || !trail_ok)
        {
            return false;
        }
        i += 2;
    }
    return true;
}
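// Note: isGBK() works by checking whether each double-byte pair falls inside the GBK code range,
// and every byte of UTF-8 encoded text also falls inside that range.
// Calling isGBK() is therefore only meaningful after isUtf8() has already ruled out UTF-8.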
/**
 * Classifies a byte buffer by encoding.
 *
 * The UTF-8 check runs first and the GBK check is consulted only when the
 * UTF-8 check fails.
 *
 * @param data buffer to inspect
 * @param len  number of valid bytes in data
 * @return UTF8 when isUtf8() accepts the buffer, otherwise GBK when isGBK()
 *         accepts it, otherwise UNKOWN
 */
CODING GetCoding(unsigned char* data, int len)
{
    if (isUtf8(data, len))
    {
        return UTF8;
    }
    if (isGBK(data, len))
    {
        return GBK;
    }
    return UNKOWN;
}
/**
 * Converts a byte sequence from one character set to another with iconv.
 *
 * The output buffer is zero-filled before the conversion, so the result is
 * NUL-terminated whenever the converted text is shorter than outlen. When the
 * converted text fills the buffer exactly, no terminator is written; callers
 * that need a C string should pass a buffer at least one byte larger than the
 * longest expected output.
 *
 * @param from_charset iconv name of the encoding of inbuf
 * @param to_charset   iconv name of the encoding to produce
 * @param inbuf        bytes to convert (not modified by this function itself)
 * @param inlen        number of bytes in inbuf
 * @param outbuf       buffer that receives the converted bytes
 * @param outlen       size of outbuf in bytes
 * @return 0 on success; -1 when outbuf is null, the conversion descriptor
 *         cannot be opened, or iconv() reports an error
 */
int code_convert(const char* from_charset, const char* to_charset, char* inbuf, size_t inlen, char* outbuf, size_t outlen)
{
    if (outbuf == nullptr)
        return -1;

    // iconv_open() reports failure as (iconv_t)-1, never as 0.
    iconv_t cd = iconv_open(to_charset, from_charset);
    if (cd == (iconv_t)(-1))
        return -1;

    memset(outbuf, 0, outlen);

    // iconv() advances the pointers and decrements the counters it is given,
    // so it works on local copies.
    char* in_cursor = inbuf;
    char* out_cursor = outbuf;
    size_t in_left = inlen;
    size_t out_left = outlen;
    const size_t rc = iconv(cd, &in_cursor, &in_left, &out_cursor, &out_left);
    iconv_close(cd);

    // Compare in size_t: the error value is (size_t)-1.
    if (rc == static_cast<size_t>(-1))
        return -1;

    // Terminate right after the converted bytes when there is room for it.
    if (out_left > 0)
        *out_cursor = '\0';
    return 0;
}
/**
 * Converts bytes from the "utf-8" charset to the "gb2312" charset by
 * forwarding to code_convert().
 *
 * @param inbuf  bytes to convert
 * @param inlen  number of bytes in inbuf
 * @param outbuf buffer that receives the converted bytes
 * @param outlen size of outbuf in bytes
 * @return whatever code_convert() returns for this conversion
 */
int u2g(char* inbuf, size_t inlen, char* outbuf, size_t outlen)
{
    const char* const source_charset = "utf-8";
    const char* const target_charset = "gb2312";
    const int status = code_convert(source_charset, target_charset, inbuf, inlen, outbuf, outlen);
    return status;
}
/**
 * Converts bytes from the "gb2312" charset to the "utf-8" charset by
 * forwarding to code_convert().
 *
 * @param inbuf  bytes to convert
 * @param inlen  number of bytes in inbuf
 * @param outbuf buffer that receives the converted bytes
 * @param outlen size of outbuf in bytes
 * @return whatever code_convert() returns for this conversion
 */
int g2u(char* inbuf, size_t inlen, char* outbuf, size_t outlen)
{
    const char* const source_charset = "gb2312";
    const char* const target_charset = "utf-8";
    const int status = code_convert(source_charset, target_charset, inbuf, inlen, outbuf, outlen);
    return status;
}
/**
 * Converts a string with g2u() and appends the converted text to str_result.
 *
 * The scratch buffer is sized at twice the input length plus one byte, so the
 * zero-filled buffer always keeps a terminating NUL after the converted text.
 * The buffer is owned by a std::string, so it is released on every path,
 * including when appending to str_result throws.
 *
 * @param strGBK     input text, passed to g2u() as the source bytes
 * @param str_result on success the converted text is appended to it (up to the
 *                   first NUL byte); on failure it is reset to the empty string
 * @return true when g2u() reports success, false otherwise
 */
bool GBKToUTF8(const std::string& strGBK,std::string& str_result)
{
    std::string buffer(strGBK.size() * 2 + 1, '\0');

    // g2u() takes a mutable pointer for its input; the const_cast only adapts
    // to that signature.
    char* const source = const_cast<char*>(strGBK.c_str());
    if (g2u(source, strGBK.size(), &buffer[0], buffer.size()) < 0)
    {
        str_result = "";
        return false;
    }

    // Append as a C string so that only the converted text is taken, not the
    // zero padding behind it.
    str_result.append(buffer.c_str());
    return true;
}
/**
 * Converts a NUL-terminated string with u2g() and appends the converted text
 * to str_result.
 *
 * The scratch buffer is one byte larger than the input so that a terminating
 * NUL remains even when the converted text is as long as the input (for
 * example pure ASCII input) and when the input is empty.
 *
 * @param utf8       NUL-terminated input text; a null pointer is treated as a
 *                   failed conversion
 * @param str_result on success the converted text is appended to it; on
 *                   failure it is reset to the empty string
 * @return true when u2g() reports success, false otherwise
 */
bool UTFtoGBK(const char* utf8, std::string& str_result)
{
    if (utf8 == nullptr)
    {
        str_result = "";
        return false;
    }

    const size_t length = strlen(utf8);
    // One spare byte guarantees that the zero-filled buffer stays NUL-terminated.
    std::string buffer(length + 1, '\0');

    // u2g() takes a mutable pointer for its input; the const_cast only adapts
    // to that signature.
    if (u2g(const_cast<char*>(utf8), length, &buffer[0], buffer.size()) < 0)
    {
        str_result = "";
        return false;
    }

    // Append as a C string so that only the converted text is taken, not the
    // zero padding behind it.
    str_result.append(buffer.c_str());
    return true;
}