Browse Source

UTF8 ANSI 文件判断

maoyehu 2 years ago
parent
commit
63c20f98f9
1 changed files with 52 additions and 1 deletions
  1. 52 1
      MFCApplication1/AssignWords.cpp

+ 52 - 1
MFCApplication1/AssignWords.cpp

@@ -21,6 +21,56 @@ string& replace_str(string& str, const string& to_replaced, const string& newcha
 }
 */
 
+
+// Returns how many consecutive 1 bits `byte` starts with, scanning from the
+// most significant bit down to the first 0 bit; the result is in [0, 8].
+// isUtf8() treats this count as the sequence length announced by a lead byte.
+int preNUm(unsigned char byte) {
+	int leadingOnes = 0;
+	unsigned char probe = 0x80;
+	// probe walks from bit 7 towards bit 0; it becomes 0 (ending the scan)
+	// once all eight bits have been examined.
+	while (probe != 0 && (byte & probe) != 0) {
+		++leadingOnes;
+		probe = probe >> 1;
+	}
+	return leadingOnes;
+}
+
+
+// Heuristic check used to tell UTF-8 text from ANSI: returns true when every
+// byte of data[0..len) is ASCII or belongs to a complete 3- to 6-byte sequence
+// (lead byte followed by 10XX_XXXX continuation bytes); len <= 0 returns true.
+// NOTE(review): the 2-byte form 110X_XXXX 10XX_XXXX is rejected, as before;
+// presumably so double-byte ANSI text is not mistaken for UTF-8 -- confirm.
+bool isUtf8(unsigned char* data, int len) {
+	int i = 0;
+	while (i < len) {
+		if ((data[i] & 0x80) == 0x00) {
+			i++;  // 0XXX_XXXX: single-byte ASCII
+			continue;
+		}
+		// preNUm() gives the leading 1-bit count of the lead byte, i.e. the sequence
+		// length in bytes: 1110_XXXX -> 3, 1111_0XXX -> 4, 1111_10XX -> 5, 1111_110X -> 6.
+		const int num = preNUm(data[i]);
+		if (num <= 2 || num > 6) {
+			// Stray continuation byte, 2-byte lead, or 0xFE/0xFF (never a lead byte).
+			return false;
+		}
+		if (len - i < num) {
+			// Sequence is cut off by the end of the buffer; do not read past it.
+			return false;
+		}
+		for (int j = 1; j < num; j++) {  // bytes after the lead must be 10XX_XXXX
+			if ((data[i + j] & 0xc0) != 0x80) {
+				return false;
+			}
+		}
+		i += num;
+	}
+	return true;
+}
+
 void split(const std::string& s, const std::string& delim, std::vector< std::string >* ret)
 {
 	size_t last = 0;
@@ -264,7 +314,8 @@ int AssignWordsFromTest(std::string pathName, std::vector<std::string> &twoList,
 		while (fin.getline(line, sizeof(line)))
 		{
 			std::string strTemp = line;
-			UTF8toANSI(strTemp);
+			if(isUtf8((unsigned char*)strTemp.c_str(),strTemp.length()))
+				UTF8toANSI(strTemp);
 			if (lpszflagWord == "0")
 			{
 				ReplaceFlagWord(WordList, strTemp);