Browse Source

加入编码格式判断

maoyehu 2 years ago
parent
commit
dfec90d823
1 changed files with 59 additions and 8 deletions
  1. 59 8
      MFCApplication1/AssignWords.cpp

+ 59 - 8
MFCApplication1/AssignWords.cpp

@@ -1,4 +1,4 @@
-#include "pch.h"
+锘�#include "pch.h"
 #include "AssignWords.h"
 #include "CvxText.h"
 #include <iostream>
@@ -68,8 +68,8 @@ void DivideEquallyStr(const char *buff, int len, char left[], char right[])
 	strncpy(left, buff, i);
 	strncpy(right, buff + i, len - i);
 	return;
-	/* 直接中间分开,不用特殊处理*/
-	std::string strKey1 = "", strKey2 = " ";
+	/* 直接中间分开，不用特殊处理 */
+	std::string strKey1 = "锛�", strKey2 = " ";
 	std::size_t found = strtemp.find(strKey1);
 	bool flag1 = false, flag2 = false;
 	while (found != std::string::npos && found < strtemp.length() / 2)
@@ -129,7 +129,7 @@ void DivideEquallyStr(const char *buff, int len, char left[], char right[])
 
 void RandReplaceStr(std::vector<std::string> list, std::string &ret)
 {
-	std::string to_replaced = "";
+	std::string to_replaced = "锛�";
 	srand((unsigned)time(NULL));
 	std::string newchars;
 	int index = rand() % list.size();
@@ -184,13 +184,60 @@ void AssignWordsFromString(std::string strContent, int maxTwoWith, int maxTreeWi
 	}
 }
 
+// UTF-8 detection helpers.
+// preNUm: counts the consecutive 1 bits at the most-significant end of `byte`,
+// stopping at the first 0 bit; the result is in the range 0..8.
+int preNUm(unsigned char byte) {
+	int ones = 0;
+	// probe bit 7 first, then bit 6, ... until a 0 bit is hit or all 8 bits are used
+	for (unsigned char probe = 0x80; probe != 0; probe >>= 1) {
+		if ((byte & probe) == 0) {
+			// first 0 bit reached: the run of leading ones is over
+			break;
+		}
+		++ones;
+	}
+	return ones;
+}
 
+// Reports whether the first `len` bytes of `data` are structurally valid UTF-8: each
+// byte is ASCII or a lead byte followed by the right number of 10XX_XXXX bytes.
+// Only bit patterns are checked (overlong forms pass); an empty buffer yields true.
+bool isUtf8(unsigned char* data, int len) {
+	int i = 0;
+	while (i < len) {
+		if ((data[i] & 0x80) == 0x00) {
+			// 0XXX_XXXX: single-byte (ASCII) character
+			i++;
+			continue;
+		}
+		// preNUm() counts the leading 1 bits of the lead byte, which is also the number
+		// of bytes in the character: 110X_XXXX -> 2, 1110_XXXX -> 3, 1111_0XXX -> 4,
+		// 1111_10XX -> 5, 1111_110X -> 6.
+		const int num = preNUm(data[i]);
+		// Reject a stray continuation byte (1), the bytes 0xFE/0xFF (7/8), and a
+		// sequence cut off by the end of the buffer, which would otherwise be read
+		// past data[len - 1] by the loop below.
+		if (num < 2 || num > 6 || num > len - i) {
+			return false;
+		}
+		i++;
+		for (int j = 0; j < num - 1; j++) {
+			// every remaining byte of the character must be 10XX_XXXX
+			if ((data[i] & 0xc0) != 0x80) {
+				return false;
+			}
+			i++;
+		}
+	}
+	return true;
+}
 
 int AssignWordsFromTest(std::string pathName, std::vector<tuple< string, string>> &oneList, std::vector<tuple< string, string>> &twoList,
 	std::vector<tuple< string, string>> &threeList, std::vector<tuple< string, string>> &fourList, int &ret)
 {
 	int nHangCount = 1;
-	// 词性处理
+	// 词性处理
 	HMODULE module = GetModuleHandle(0);
 	TCHAR pFileName[MAX_PATH + 2] = { 0 };
 	GetModuleFileName(module, pFileName, MAX_PATH);
@@ -241,8 +288,12 @@ int AssignWordsFromTest(std::string pathName, std::vector<tuple< string, string>
 			}
 			
 			std::string strTemp = line,strTempAnother = lineOther;
-			UTF8toANSI(strTemp);
-			UTF8toANSI(strTempAnother);
+			if (isUtf8((unsigned char*)strTemp.c_str(), strTemp.length()))
+				UTF8toANSI(strTemp);
+			//UTF8toANSI(strTempAnother);
+			if(isUtf8((unsigned char*)strTempAnother.c_str(), strTempAnother.length()))
+				UTF8toANSI(strTempAnother);
+			//strTempAnother;
 			if (strTemp == "" && strTempAnother=="")
 				continue;
 			else if (strTemp == "")
@@ -276,7 +327,7 @@ int AssignWordsFromTest(std::string pathName, std::vector<tuple< string, string>
 			memset(lineOther, 0, 4096);
 			nHangCount++;
 		}
-		// 源文件结束,查看答案文件是否结束
+		// 源文件结束，查看答案文件是否结束
 		if (finOther.getline(lineOther, sizeof(lineOther)))
 		{
 			return -2;