#include #include #include #include "iconv-utils.h" int preNUm(unsigned char byte) { unsigned char mask = 0x80; int num = 0; for (int i = 0; i < 8; i++) { if ((byte & mask) == mask) { mask = mask >> 1; num++; } else { break; } } return num; } bool isUtf8(unsigned char* data, int len) { int num = 0; int i = 0; while (i < len) { if ((data[i] & 0x80) == 0x00) { // 0XXX_XXXX i++; continue; } else if ((num = preNUm(data[i])) > 2) { // 110X_XXXX 10XX_XXXX // 1110_XXXX 10XX_XXXX 10XX_XXXX // 1111_0XXX 10XX_XXXX 10XX_XXXX 10XX_XXXX // 1111_10XX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX // 1111_110X 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX // preNUm() 返回首个字节8个bits中首�?0bit前面1bit的个数,该数量也是该字符所使用的字节数 i++; for (int j = 0; j < num - 1; j++) { //判断后面num - 1 个字节是不是都是10开 if ((data[i] & 0xc0) != 0x80) { return false; } i++; } } else { //其他情况说明不是utf-8 return false; } } return true; } bool isGBK(unsigned char* data, int len) { int i = 0; while (i < len) { if (data[i] <= 0x7f) { //编码小于等于127,只有一个字节的编码,兼容ASCII i++; continue; } else { //大于127的使用双字节编码 if (data[i] >= 0x81 && data[i] <= 0xfe && data[i + 1] >= 0x40 && data[i + 1] <= 0xfe && data[i + 1] != 0xf7) { i += 2; continue; } else { return false; } } } return true; } //需要说明的是,isGBK()是通过双字节是否落在gbk的编码范围内实现的, //而utf-8编码格式的每个字节都是落在gbk的编码范围内�? //所以只有先调用isUtf8()先判断不是utf-8编码,再调用isGBK()才有意义 CODING GetCoding(unsigned char* data, int len) { CODING coding; if (isUtf8(data, len) == true) { coding = UTF8; } else if (isGBK(data, len) == true) { coding = GBK; } else { coding = UNKOWN; } return coding; } int code_convert(const char* from_charset, const char* to_charset, char* inbuf, size_t inlen, char* outbuf, size_t outlen) { iconv_t cd; char** pin = &inbuf; char** pout = &outbuf; cd = iconv_open(to_charset, from_charset); if (cd == 0) return -1; memset(outbuf, 0, outlen); if ((int)iconv(cd, pin, &inlen, pout, &outlen) == -1) { iconv_close(cd); return -1; } iconv_close(cd); #ifndef _WIN32 * pout = '\0'; #else *pout = (char*)'\0'; #endif return 0; } int u2g(char* inbuf, size_t inlen, char* outbuf, size_t outlen) { return code_convert("utf-8", "gb2312", inbuf, inlen, outbuf, outlen); } int g2u(char* inbuf, size_t inlen, char* outbuf, size_t outlen) { return code_convert("gb2312", "utf-8", inbuf, inlen, outbuf, outlen); } bool GBKToUTF8(const std::string& strGBK,std::string& str_result) { int length = strGBK.size() * 2 + 1; char* temp = (char*)malloc(sizeof(char) * length); if (g2u((char*)strGBK.c_str(), strGBK.size(), temp, length) >= 0) { str_result.append(temp); free(temp); return true; } else { free(temp); str_result = ""; return false; } } bool UTFtoGBK(const char* utf8, std::string& str_result) { int length = strlen(utf8); char* temp = (char*)malloc(sizeof(char) * length); if (u2g((char*)utf8, length, temp, length) >= 0) { str_result.append(temp); free(temp); return true; } else { free(temp); str_result = ""; return false; } }