OCRTrainingHelper.cs 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405
  1. using System;
  2. using System.Collections.Generic;
  3. using System.Diagnostics;
  4. using System.Drawing;
  5. using System.Drawing.Imaging;
  6. using System.IO;
  7. using System.Linq;
  8. using OpenCvSharp;
  9. namespace OCRTest
  10. {
  11. /// <summary>
  12. /// OCR训练数据准备工具
  13. /// 用于准备和预处理训练图片
  14. /// </summary>
  15. public class OCRTrainingHelper
  16. {
  /// <summary>
  /// Batch-preprocesses the training images located directly inside
  /// <paramref name="inputFolder"/> and writes every result as a PNG file into
  /// <paramref name="outputFolder"/>. A failure on one image is logged to the
  /// console and does not stop the remaining images from being processed.
  /// </summary>
  /// <param name="inputFolder">Folder that contains the source images.</param>
  /// <param name="outputFolder">Folder that receives the results; created when it does not exist.</param>
  /// <param name="targetWidth">Target width in pixels.</param>
  /// <param name="targetHeight">Target height in pixels; 0 means "derive it automatically".</param>
  /// <exception cref="DirectoryNotFoundException"><paramref name="inputFolder"/> does not exist.</exception>
  public static void PreprocessTrainingImages(string inputFolder, string outputFolder,
      int targetWidth = 800, int targetHeight = 0)
  {
      if (!Directory.Exists(inputFolder))
      {
          throw new DirectoryNotFoundException($"输入文件夹不存在:{inputFolder}");
      }

      // CreateDirectory does nothing when the folder is already there,
      // so no separate existence check is needed.
      Directory.CreateDirectory(outputFolder);

      // Directory search patterns only understand '*' and '?'. A brace list such as
      // "*.{png,jpg}" matches nothing, so the files are filtered by extension instead.
      var imageExtensions = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
      {
          ".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"
      };
      string[] imageFiles = Directory.EnumerateFiles(inputFolder)
          .Where(path => imageExtensions.Contains(Path.GetExtension(path)))
          .OrderBy(path => path, StringComparer.OrdinalIgnoreCase)
          .ToArray();

      Console.WriteLine($"找到 {imageFiles.Length} 张图片");

      int successCount = 0;
      int failCount = 0;
      foreach (string inputFile in imageFiles)
      {
          try
          {
              // The output keeps the base name and always uses the .png extension.
              string fileName = Path.GetFileNameWithoutExtension(inputFile);
              string outputFile = Path.Combine(outputFolder, $"{fileName}.png");
              PreprocessSingleImage(inputFile, outputFile, targetWidth, targetHeight);
              successCount++;
              Console.WriteLine($"[{successCount + failCount}/{imageFiles.Length}] 处理成功:{fileName}");
          }
          catch (Exception ex)
          {
              // Best effort: report the failure and carry on with the next image.
              failCount++;
              Console.WriteLine($"处理失败:{Path.GetFileName(inputFile)} - {ex.Message}");
          }
      }

      Console.WriteLine($"\n处理完成!成功:{successCount},失败:{failCount}");
  }
  /// <summary>
  /// Preprocesses a single image: optional aspect-preserving resize, grayscale
  /// conversion, 3x3 Gaussian blur and adaptive Gaussian thresholding. The
  /// result is written to <paramref name="outputFile"/>.
  /// </summary>
  /// <param name="inputFile">Path of the image to read.</param>
  /// <param name="outputFile">Path the processed image is written to.</param>
  /// <param name="targetWidth">Target width in pixels; values &lt;= 0 mean "derive from the height".</param>
  /// <param name="targetHeight">Target height in pixels; values &lt;= 0 mean "derive from the width".</param>
  /// <exception cref="IOException">The image could not be read or the result could not be written.</exception>
  private static void PreprocessSingleImage(string inputFile, string outputFile,
      int targetWidth, int targetHeight)
  {
      using (var src = Cv2.ImRead(inputFile, ImreadModes.Color))
      {
          if (src.Empty())
          {
              throw new IOException("无法读取图片");
          }

          // The clone is a separate Mat, so it gets its own using block;
          // otherwise it would be left undisposed for every processed image.
          using (Mat processed = src.Clone())
          {
              // 1. Resize, keeping the aspect ratio. When both dimensions are
              //    given the image is fitted inside the target box.
              bool hasWidth = targetWidth > 0;
              bool hasHeight = targetHeight > 0;
              if (hasWidth || hasHeight)
              {
                  int newWidth;
                  int newHeight;
                  if (!hasHeight)
                  {
                      double scale = (double)targetWidth / src.Width;
                      newWidth = targetWidth;
                      newHeight = (int)(src.Height * scale);
                  }
                  else if (!hasWidth)
                  {
                      double scale = (double)targetHeight / src.Height;
                      newWidth = (int)(src.Width * scale);
                      newHeight = targetHeight;
                  }
                  else
                  {
                      double scale = Math.Min((double)targetWidth / src.Width, (double)targetHeight / src.Height);
                      newWidth = (int)(src.Width * scale);
                      newHeight = (int)(src.Height * scale);
                  }

                  // Truncation can produce 0 for extreme aspect ratios; a
                  // zero-sized target is not a valid resize destination.
                  newWidth = Math.Max(1, newWidth);
                  newHeight = Math.Max(1, newHeight);
                  Cv2.Resize(processed, processed, new OpenCvSharp.Size(newWidth, newHeight));
              }

              // 2. Convert to grayscale (only needed for multi-channel data).
              if (processed.Channels() > 1)
              {
                  Cv2.CvtColor(processed, processed, ColorConversionCodes.BGR2GRAY);
              }

              // 3. Noise reduction with a 3x3 Gaussian kernel.
              Cv2.GaussianBlur(processed, processed, new OpenCvSharp.Size(3, 3), 0);

              // 4. Adaptive binarization (block size 11, constant 2).
              Cv2.AdaptiveThreshold(processed, processed, 255,
                  AdaptiveThresholdTypes.GaussianC, ThresholdTypes.Binary, 11, 2);

              // 5. Save the result. ImWrite reports failure through its return
              //    value, so it is checked to keep a failed write from being
              //    counted as a success by the caller.
              if (!Cv2.ImWrite(outputFile, processed))
              {
                  throw new IOException($"无法写入图片:{outputFile}");
              }
          }
      }
  }
  /// <summary>
  /// Creates a UTF-8 ".txt" label file next to every image referenced by
  /// <paramref name="labels"/>. Entries without a matching image only produce
  /// a console warning.
  /// </summary>
  /// <param name="imageFolder">Folder that contains the images.</param>
  /// <param name="labels">Map from image file name (without extension) to the text of the image.</param>
  public static void GenerateLabelFiles(string imageFolder, Dictionary<string, string> labels)
  {
      // Supported image formats, probed in this order; the first existing file wins.
      string[] candidateExtensions = { ".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff" };

      foreach (KeyValuePair<string, string> entry in labels)
      {
          string matchedImage = candidateExtensions
              .Select(extension => Path.Combine(imageFolder, entry.Key + extension))
              .FirstOrDefault(candidate => File.Exists(candidate));

          if (matchedImage != null)
          {
              // The label file shares the image's path, with a .txt extension.
              string labelPath = Path.ChangeExtension(matchedImage, ".txt");
              File.WriteAllText(labelPath, entry.Value, System.Text.Encoding.UTF8);
              Console.WriteLine($"生成标注文件:{Path.GetFileName(labelPath)}");
          }
          else
          {
              Console.WriteLine($"警告:找不到图片文件 {entry.Key}");
          }
      }
  }
  /// <summary>
  /// Checks that every image located directly inside <paramref name="folder"/>
  /// has a label file with the same path and a ".txt" extension, and prints
  /// a summary to the console.
  /// </summary>
  /// <param name="folder">Dataset folder.</param>
  /// <returns>File names of the images that have no label file.</returns>
  public static List<string> ValidateDataset(string folder)
  {
      // Directory search patterns only understand '*' and '?'. A brace list such as
      // "*.{png,jpg}" matches nothing, so the files are filtered by extension instead.
      var imageExtensions = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
      {
          ".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"
      };
      string[] imageFiles = Directory.EnumerateFiles(folder)
          .Where(path => imageExtensions.Contains(Path.GetExtension(path)))
          .OrderBy(path => path, StringComparer.OrdinalIgnoreCase)
          .ToArray();

      List<string> missingLabels = imageFiles
          .Where(imageFile => !File.Exists(Path.ChangeExtension(imageFile, ".txt")))
          .Select(imageFile => Path.GetFileName(imageFile))
          .ToList();

      if (missingLabels.Count > 0)
      {
          Console.WriteLine($"发现 {missingLabels.Count} 个缺少标注文件的图片:");
          foreach (string file in missingLabels)
          {
              Console.WriteLine($" - {file}");
          }
      }
      else
      {
          Console.WriteLine($"数据集完整!共 {imageFiles.Length} 个样本");
      }

      return missingLabels;
  }
  /// <summary>
  /// Counts how often each character occurs in the "*.txt" files located
  /// directly inside <paramref name="labelFolder"/> and prints the 50 most
  /// frequent characters to the console.
  /// </summary>
  /// <param name="labelFolder">Folder that contains the label files.</param>
  /// <returns>
  /// Character-to-count dictionary, filled in descending count order
  /// (ties ordered by character value).
  /// </returns>
  public static Dictionary<char, int> AnalyzeCharacterFrequency(string labelFolder)
  {
      var charFrequency = new Dictionary<char, int>();
      foreach (string txtFile in Directory.GetFiles(labelFolder, "*.txt"))
      {
          string content = File.ReadAllText(txtFile, System.Text.Encoding.UTF8);
          foreach (char c in content)
          {
              // TryGetValue leaves 0 in 'occurrences' for an unseen character,
              // so one lookup plus one store covers both cases.
              int occurrences;
              charFrequency.TryGetValue(c, out occurrences);
              charFrequency[c] = occurrences + 1;
          }
      }

      // Rank once into a list: Dictionary enumeration order is not a documented
      // guarantee, so the report is printed from the ranked list. The secondary
      // key makes the order independent of file enumeration order.
      List<KeyValuePair<char, int>> ranked = charFrequency
          .OrderByDescending(kvp => kvp.Value)
          .ThenBy(kvp => kvp.Key)
          .ToList();

      Console.WriteLine("字符频率统计(前50个):");
      foreach (KeyValuePair<char, int> kvp in ranked.Take(50))
      {
          string displayChar = kvp.Key == ' ' ? "(空格)" : kvp.Key.ToString();
          Console.WriteLine($" '{displayChar}': {kvp.Value} 次");
      }
      Console.WriteLine($"\n总字符种类:{ranked.Count}");

      return ranked.ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
  }
  /// <summary>
  /// Generates a Windows batch file that runs the Tesseract training tool
  /// chain for the given font and writes it to <paramref name="outputPath"/>.
  /// </summary>
  /// <param name="fontName">Font name; used as the LANG value and as the prefix of the generated files.</param>
  /// <param name="outputPath">Path of the batch file to create.</param>
  /// <exception cref="ArgumentException"><paramref name="fontName"/> is null, empty or whitespace.</exception>
  public static void GenerateTrainingScript(string fontName, string outputPath)
  {
      // An empty name would yield commands such as "set LANG=" and
      // "combine_tessdata ." in the script, so it is rejected here.
      if (string.IsNullOrWhiteSpace(fontName))
      {
          throw new ArgumentException("字体名称不能为空", nameof(fontName));
      }

      // "chcp 65001" switches the console to UTF-8 so that the non-ASCII
      // messages below, which are saved as UTF-8, are displayed correctly.
      string script = $@"@echo off
chcp 65001 >nul
echo ========================================
echo Tesseract OCR 训练脚本
echo 字体名称:{fontName}
echo ========================================
echo.
set LANG={fontName}
set TESSDATA_PREFIX=%~dp0tessdata
echo [1/8] 生成box文件...
tesseract %LANG%.exp0.tif %LANG%.exp0 batch.nochop makebox
if errorlevel 1 goto error
echo [2/8] 生成训练文件...
tesseract %LANG%.exp0.tif %LANG%.exp0 nobatch box.train
if errorlevel 1 goto error
echo [3/8] 提取字符集...
unicharset_extractor %LANG%.exp0.box
if errorlevel 1 goto error
echo [4/8] 形状聚类...
shapeclustering -F unicharset -O unicharset %LANG%.exp0.tr
if errorlevel 1 goto error
echo [5/8] MF训练...
mftraining -F unicharset -U unicharset -O {fontName}.unicharset %LANG%.exp0.tr
if errorlevel 1 goto error
echo [6/8] CN训练...
cntraining %LANG%.exp0.tr
if errorlevel 1 goto error
echo [7/8] 重命名文件...
rename normproto {fontName}.normproto
rename inttemp {fontName}.inttemp
rename pffmtable {fontName}.pffmtable
rename shapetable {fontName}.shapetable
echo [8/8] 合并训练数据...
combine_tessdata {fontName}.
if errorlevel 1 goto error
echo.
echo ========================================
echo 训练完成!
echo 生成的文件:{fontName}.traineddata
echo 请将其复制到 tessdata 文件夹
echo ========================================
pause
exit /b 0
:error
echo.
echo 训练失败!错误代码:%errorlevel%
pause
exit /b 1
";

      // The literal's line endings depend on how this source file was saved.
      // Normalize to CRLF, the conventional line ending for batch files.
      script = script.Replace("\r\n", "\n").Replace("\n", "\r\n");

      // Encoding.UTF8 would emit a byte-order mark in front of "@echo off",
      // which the command interpreter would read as part of the first command,
      // so the file is written as UTF-8 without a BOM.
      File.WriteAllText(outputPath, script, new System.Text.UTF8Encoding(false));
      Console.WriteLine($"训练脚本已生成:{outputPath}");
  }
  /// <summary>
  /// Creates a sample training project: the project folder with its
  /// "raw_images", "processed_images" and "labels" sub-folders, a README.md
  /// describing the workflow, and a "train.bat" script for the font "myfont".
  /// </summary>
  /// <param name="projectFolder">Project folder; created when it does not exist.</param>
  public static void CreateSampleProject(string projectFolder)
  {
      // Create the sub-folders. CreateDirectory also creates the missing
      // parent (the project folder) and does nothing for existing folders.
      string rawFolder = Path.Combine(projectFolder, "raw_images");
      string processedFolder = Path.Combine(projectFolder, "processed_images");
      string labelsFolder = Path.Combine(projectFolder, "labels");
      string trainScriptPath = Path.Combine(projectFolder, "train.bat");
      Directory.CreateDirectory(projectFolder);
      Directory.CreateDirectory(rawFolder);
      Directory.CreateDirectory(processedFolder);
      Directory.CreateDirectory(labelsFolder);

      // Create the README. The paths inside the C# samples are emitted as
      // verbatim literals (@"...") so that backslashes in Windows paths do not
      // turn into invalid escape sequences when the sample is copied into code.
      string readme = $@"# OCR训练项目
## 文件夹说明
- **raw_images/**: 原始训练图片
- **processed_images/**: 预处理后的图片
- **labels/**: 标注文件(.txt格式)
## 使用步骤
### 1. 准备训练数据
将训练图片放入 `raw_images` 文件夹
### 2. 预处理图片
```csharp
OCRTrainingHelper.PreprocessTrainingImages(
@""{rawFolder}"",
@""{processedFolder}"",
targetWidth: 800
);
```
### 3. 创建标注文件
为每张图片创建对应的 .txt 文件,内容为识别文本
### 4. 验证数据集
```csharp
OCRTrainingHelper.ValidateDataset(@""{processedFolder}"");
```
### 5. 分析字符频率
```csharp
OCRTrainingHelper.AnalyzeCharacterFrequency(@""{labelsFolder}"");
```
### 6. 生成训练脚本
```csharp
OCRTrainingHelper.GenerateTrainingScript(
""myfont"",
@""{trainScriptPath}""
);
```
### 7. 执行训练
运行 `train.bat` 开始训练
## 注意事项
- 每个样本至少需要100-500张图片
- 图片应覆盖所有需要识别的字符
- 标注文件必须与图片同名(仅扩展名不同)
- 建议使用PNG格式保存预处理后的图片
";
      File.WriteAllText(Path.Combine(projectFolder, "README.md"), readme);

      // Generate the training script next to the README.
      GenerateTrainingScript("myfont", trainScriptPath);

      Console.WriteLine($"示例项目已创建:{projectFolder}");
      Console.WriteLine("请按照 README.md 中的说明进行操作");
  }
  337. }
  338. }