using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Linq;
using OpenCvSharp;

namespace OCRTest
{
    /// <summary>
    /// Helper for preparing OCR training data: batch image preprocessing,
    /// label-file generation, dataset validation, character statistics and
    /// generation of a Tesseract training batch script / sample project.
    /// </summary>
    public class OCRTrainingHelper
    {
        /// <summary>
        /// Supported image extensions. The order matters for
        /// <see cref="GenerateLabelFiles"/>, which probes them in sequence and
        /// uses the first file that exists.
        /// </summary>
        private static readonly string[] ImageExtensions =
        {
            ".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"
        };

        /// <summary>Case-insensitive lookup set built from <see cref="ImageExtensions"/>.</summary>
        private static readonly HashSet<string> ImageExtensionSet =
            new HashSet<string>(ImageExtensions, StringComparer.OrdinalIgnoreCase);

        /// <summary>
        /// Returns the image files located directly in <paramref name="folder"/>.
        /// </summary>
        /// <remarks>
        /// .NET search patterns only understand <c>*</c> and <c>?</c>; a brace
        /// pattern such as <c>*.{png,jpg}</c> is matched literally and finds
        /// nothing, so files are filtered by extension instead.
        /// </remarks>
        /// <param name="folder">Folder to scan (non-recursive).</param>
        /// <returns>Full paths of the matching files, sorted by path for a stable order.</returns>
        private static string[] GetImageFiles(string folder)
        {
            return Directory.GetFiles(folder)
                .Where(path => ImageExtensionSet.Contains(Path.GetExtension(path)))
                .OrderBy(path => path, StringComparer.OrdinalIgnoreCase)
                .ToArray();
        }

        /// <summary>
        /// Batch-preprocesses training images (resize, grayscale, blur, adaptive
        /// threshold) and writes each result as a PNG into the output folder.
        /// </summary>
        /// <remarks>
        /// Output files are named after the input file without its extension, so
        /// two inputs that differ only by extension map to the same output file.
        /// A failure on one image is reported on the console and does not stop
        /// the batch.
        /// </remarks>
        /// <param name="inputFolder">Folder containing the source images.</param>
        /// <param name="outputFolder">Destination folder; created when missing.</param>
        /// <param name="targetWidth">Target width in pixels; a value &lt;= 0 derives it from the height.</param>
        /// <param name="targetHeight">Target height in pixels; a value &lt;= 0 derives it from the width.</param>
        /// <exception cref="DirectoryNotFoundException">The input folder does not exist.</exception>
        public static void PreprocessTrainingImages(string inputFolder, string outputFolder, int targetWidth = 800, int targetHeight = 0)
        {
            if (!Directory.Exists(inputFolder))
            {
                throw new DirectoryNotFoundException($"输入文件夹不存在:{inputFolder}");
            }

            // CreateDirectory is a no-op when the folder already exists.
            Directory.CreateDirectory(outputFolder);

            string[] imageFiles = GetImageFiles(inputFolder);
            Console.WriteLine($"找到 {imageFiles.Length} 张图片");

            int successCount = 0;
            int failCount = 0;

            foreach (var inputFile in imageFiles)
            {
                try
                {
                    string fileName = Path.GetFileNameWithoutExtension(inputFile);
                    string outputFile = Path.Combine(outputFolder, $"{fileName}.png");

                    PreprocessSingleImage(inputFile, outputFile, targetWidth, targetHeight);

                    successCount++;
                    Console.WriteLine($"[{successCount + failCount}/{imageFiles.Length}] 处理成功:{fileName}");
                }
                catch (Exception ex)
                {
                    // Deliberate best-effort: report the failure and keep going.
                    failCount++;
                    Console.WriteLine($"处理失败:{Path.GetFileName(inputFile)} - {ex.Message}");
                }
            }

            Console.WriteLine($"\n处理完成!成功:{successCount},失败:{failCount}");
        }

        /// <summary>
        /// Computes the resize target that preserves the source aspect ratio.
        /// </summary>
        /// <remarks>
        /// A non-positive target dimension means "derive it from the other one".
        /// When both are given the image is fitted inside the box. The limiting
        /// dimension is set to the requested value exactly (no floating-point
        /// truncation) and the derived one is clamped to at least one pixel.
        /// The caller must ensure at least one target dimension is positive.
        /// </remarks>
        private static OpenCvSharp.Size ComputeTargetSize(int srcWidth, int srcHeight, int targetWidth, int targetHeight)
        {
            double widthScale = targetWidth > 0 ? (double)targetWidth / srcWidth : double.MaxValue;
            double heightScale = targetHeight > 0 ? (double)targetHeight / srcHeight : double.MaxValue;

            if (widthScale <= heightScale)
            {
                // Width is the limiting dimension.
                return new OpenCvSharp.Size(targetWidth, Math.Max(1, (int)(srcHeight * widthScale)));
            }

            // Height is the limiting dimension.
            return new OpenCvSharp.Size(Math.Max(1, (int)(srcWidth * heightScale)), targetHeight);
        }

        /// <summary>
        /// Preprocesses a single image and saves the result to <paramref name="outputFile"/>.
        /// </summary>
        /// <param name="inputFile">Path of the image to read.</param>
        /// <param name="outputFile">Path of the file to write.</param>
        /// <param name="targetWidth">Target width in pixels; &lt;= 0 means automatic.</param>
        /// <param name="targetHeight">Target height in pixels; &lt;= 0 means automatic.</param>
        /// <exception cref="InvalidDataException">The input image could not be decoded.</exception>
        /// <exception cref="IOException">The processed image could not be written.</exception>
        private static void PreprocessSingleImage(string inputFile, string outputFile, int targetWidth, int targetHeight)
        {
            using (var src = Cv2.ImRead(inputFile, ImreadModes.Color))
            {
                if (src.Empty())
                {
                    throw new InvalidDataException("无法读取图片");
                }

                // Mat wraps native memory, so the working copy must be disposed too.
                using (Mat processed = src.Clone())
                {
                    // 1. Resize, keeping the aspect ratio.
                    if (targetWidth > 0 || targetHeight > 0)
                    {
                        OpenCvSharp.Size newSize = ComputeTargetSize(src.Width, src.Height, targetWidth, targetHeight);
                        Cv2.Resize(processed, processed, newSize);
                    }

                    // 2. Convert to grayscale.
                    if (processed.Channels() > 1)
                    {
                        Cv2.CvtColor(processed, processed, ColorConversionCodes.BGR2GRAY);
                    }

                    // 3. Denoise with a 3x3 Gaussian blur.
                    Cv2.GaussianBlur(processed, processed, new OpenCvSharp.Size(3, 3), 0);

                    // 4. Adaptive binarization (block size 11, constant 2).
                    Cv2.AdaptiveThreshold(processed, processed, 255, AdaptiveThresholdTypes.GaussianC, ThresholdTypes.Binary, 11, 2);

                    // 5. Save; the format is chosen from the output file extension.
                    //    ImWrite reports failure through its return value, so check it
                    //    instead of counting a failed save as a success.
                    if (!Cv2.ImWrite(outputFile, processed))
                    {
                        throw new IOException($"无法保存图片:{outputFile}");
                    }
                }
            }
        }

        /// <summary>
        /// Writes a <c>.txt</c> label file next to each image named in <paramref name="labels"/>.
        /// </summary>
        /// <remarks>
        /// For every key the supported image extensions are probed in order and the
        /// first existing file is used. Keys without a matching image are reported
        /// on the console and skipped.
        /// </remarks>
        /// <param name="imageFolder">Folder containing the images.</param>
        /// <param name="labels">Map from image file name (without extension) to its text content.</param>
        /// <exception cref="ArgumentNullException"><paramref name="labels"/> is null.</exception>
        public static void GenerateLabelFiles(string imageFolder, Dictionary<string, string> labels)
        {
            if (labels == null)
            {
                throw new ArgumentNullException(nameof(labels));
            }

            foreach (var kvp in labels)
            {
                string imageName = kvp.Key;
                string text = kvp.Value;

                string imagePath = ImageExtensions
                    .Select(ext => Path.Combine(imageFolder, imageName + ext))
                    .FirstOrDefault(File.Exists);

                if (imagePath == null)
                {
                    Console.WriteLine($"警告:找不到图片文件 {imageName}");
                    continue;
                }

                // The label file shares the image's base name.
                // NOTE(review): Encoding.UTF8 emits a byte-order mark; confirm the
                // downstream training tools accept it.
                string txtPath = Path.ChangeExtension(imagePath, ".txt");
                File.WriteAllText(txtPath, text, System.Text.Encoding.UTF8);
                Console.WriteLine($"生成标注文件:{Path.GetFileName(txtPath)}");
            }
        }

        /// <summary>
        /// Checks that every image in <paramref name="folder"/> has a sibling
        /// <c>.txt</c> label file and prints a summary to the console.
        /// </summary>
        /// <param name="folder">Dataset folder (scanned non-recursively).</param>
        /// <returns>File names of the images that have no label file.</returns>
        public static List<string> ValidateDataset(string folder)
        {
            string[] imageFiles = GetImageFiles(folder);

            List<string> missingLabels = imageFiles
                .Where(imageFile => !File.Exists(Path.ChangeExtension(imageFile, ".txt")))
                .Select(imageFile => Path.GetFileName(imageFile))
                .ToList();

            if (missingLabels.Count > 0)
            {
                Console.WriteLine($"发现 {missingLabels.Count} 个缺少标注文件的图片:");
                foreach (var file in missingLabels)
                {
                    Console.WriteLine($" - {file}");
                }
            }
            else
            {
                Console.WriteLine($"数据集完整!共 {imageFiles.Length} 个样本");
            }

            return missingLabels;
        }

        /// <summary>
        /// Counts how often each character occurs across all <c>*.txt</c> files in
        /// <paramref name="labelFolder"/> and prints the 50 most frequent ones.
        /// </summary>
        /// <remarks>
        /// Every <see cref="char"/> of the file content is counted, including
        /// whitespace and line-break characters.
        /// </remarks>
        /// <param name="labelFolder">Folder containing the label files.</param>
        /// <returns>
        /// Character-to-count map, filled in descending count order. Note that
        /// <see cref="Dictionary{TKey,TValue}"/> does not formally guarantee its
        /// enumeration order.
        /// </returns>
        public static Dictionary<char, int> AnalyzeCharacterFrequency(string labelFolder)
        {
            var charFrequency = new Dictionary<char, int>();

            foreach (var txtFile in Directory.GetFiles(labelFolder, "*.txt"))
            {
                string content = File.ReadAllText(txtFile, System.Text.Encoding.UTF8);
                foreach (char c in content)
                {
                    // TryGetValue leaves 'seen' at 0 for a new character.
                    int seen;
                    charFrequency.TryGetValue(c, out seen);
                    charFrequency[c] = seen + 1;
                }
            }

            // Rank once; the console listing uses this list so that it does not
            // depend on dictionary enumeration order.
            var ranked = charFrequency.OrderByDescending(kvp => kvp.Value).ToList();

            Console.WriteLine("字符频率统计(前50个):");
            foreach (var kvp in ranked.Take(50))
            {
                string displayChar = kvp.Key == ' ' ? "(空格)" : kvp.Key.ToString();
                Console.WriteLine($" '{displayChar}': {kvp.Value} 次");
            }

            Console.WriteLine($"\n总字符种类:{ranked.Count}");

            return ranked.ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
        }

        /// <summary>
        /// Generates a Windows batch script that runs the Tesseract training tool chain.
        /// </summary>
        /// <remarks>
        /// The script is written as UTF-8 without a byte-order mark and with CRLF
        /// line endings: cmd.exe treats a BOM as part of the first command, which
        /// breaks <c>@echo off</c>.
        /// NOTE(review): on consoles that are not using code page 65001 the Chinese
        /// echo text may display garbled; the commands themselves are ASCII.
        /// NOTE(review): the script passes <c>unicharset</c> to the <c>-F</c> option
        /// of shapeclustering/mftraining; Tesseract's training documentation
        /// describes <c>-F</c> as the font_properties file - confirm this is intended.
        /// </remarks>
        /// <param name="fontName">Font/language name inserted into the script.</param>
        /// <param name="outputPath">Path of the batch file to write.</param>
        public static void GenerateTrainingScript(string fontName, string outputPath)
        {
            string[] scriptLines =
            {
                "@echo off",
                "echo ========================================",
                "echo Tesseract OCR 训练脚本",
                $"echo 字体名称:{fontName}",
                "echo ========================================",
                "echo.",
                "",
                $"set LANG={fontName}",
                "set TESSDATA_PREFIX=%~dp0tessdata",
                "",
                "echo [1/8] 生成box文件...",
                "tesseract %LANG%.exp0.tif %LANG%.exp0 batch.nochop makebox",
                "if errorlevel 1 goto error",
                "",
                "echo [2/8] 生成训练文件...",
                "tesseract %LANG%.exp0.tif %LANG%.exp0 nobatch box.train",
                "if errorlevel 1 goto error",
                "",
                "echo [3/8] 提取字符集...",
                "unicharset_extractor %LANG%.exp0.box",
                "if errorlevel 1 goto error",
                "",
                "echo [4/8] 形状聚类...",
                "shapeclustering -F unicharset -O unicharset %LANG%.exp0.tr",
                "if errorlevel 1 goto error",
                "",
                "echo [5/8] MF训练...",
                $"mftraining -F unicharset -U unicharset -O {fontName}.unicharset %LANG%.exp0.tr",
                "if errorlevel 1 goto error",
                "",
                "echo [6/8] CN训练...",
                "cntraining %LANG%.exp0.tr",
                "if errorlevel 1 goto error",
                "",
                "echo [7/8] 重命名文件...",
                $"rename normproto {fontName}.normproto",
                $"rename inttemp {fontName}.inttemp",
                $"rename pffmtable {fontName}.pffmtable",
                $"rename shapetable {fontName}.shapetable",
                "",
                "echo [8/8] 合并训练数据...",
                $"combine_tessdata {fontName}.",
                "if errorlevel 1 goto error",
                "",
                "echo.",
                "echo ========================================",
                "echo 训练完成!",
                $"echo 生成的文件:{fontName}.traineddata",
                "echo 请将其复制到 tessdata 文件夹",
                "echo ========================================",
                "pause",
                "exit /b 0",
                "",
                ":error",
                "echo.",
                "echo 训练失败!错误代码:%errorlevel%",
                "pause",
                "exit /b 1",
                ""
            };

            // Batch files are written with explicit CRLF line endings.
            string script = string.Join("\r\n", scriptLines);

            File.WriteAllText(outputPath, script, new System.Text.UTF8Encoding(false));
            Console.WriteLine($"训练脚本已生成:{outputPath}");
        }

        /// <summary>
        /// Creates a sample training project: the folder layout, a README.md and a
        /// training script named <c>train.bat</c>.
        /// </summary>
        /// <param name="projectFolder">Root folder of the project; created when missing.</param>
        public static void CreateSampleProject(string projectFolder)
        {
            // CreateDirectory is a no-op for folders that already exist.
            Directory.CreateDirectory(projectFolder);

            // Sub-folders.
            string rawFolder = Path.Combine(projectFolder, "raw_images");
            string processedFolder = Path.Combine(projectFolder, "processed_images");
            string labelsFolder = Path.Combine(projectFolder, "labels");

            Directory.CreateDirectory(rawFolder);
            Directory.CreateDirectory(processedFolder);
            Directory.CreateDirectory(labelsFolder);

            // README content.
            string[] readmeLines =
            {
                "# OCR训练项目",
                "",
                "## 文件夹说明",
                "",
                "- **raw_images/**: 原始训练图片",
                "- **processed_images/**: 预处理后的图片",
                "- **labels/**: 标注文件(.txt格式)",
                "",
                "## 使用步骤",
                "",
                "### 1. 准备训练数据",
                "将训练图片放入 `raw_images` 文件夹",
                "",
                "### 2. 预处理图片",
                "```csharp",
                "OCRTrainingHelper.PreprocessTrainingImages(",
                $"    \"{rawFolder}\",",
                $"    \"{processedFolder}\",",
                "    targetWidth: 800",
                ");",
                "```",
                "",
                "### 3. 创建标注文件",
                "为每张图片创建对应的 .txt 文件,内容为识别文本",
                "",
                "### 4. 验证数据集",
                "```csharp",
                $"OCRTrainingHelper.ValidateDataset(\"{processedFolder}\");",
                "```",
                "",
                "### 5. 分析字符频率",
                "```csharp",
                $"OCRTrainingHelper.AnalyzeCharacterFrequency(\"{labelsFolder}\");",
                "```",
                "",
                "### 6. 生成训练脚本",
                "```csharp",
                "OCRTrainingHelper.GenerateTrainingScript(",
                "    \"myfont\",",
                $"    \"{projectFolder}/train.bat\"",
                ");",
                "```",
                "",
                "### 7. 执行训练",
                "运行 `train.bat` 开始训练",
                "",
                "## 注意事项",
                "",
                "- 每个样本至少需要100-500张图片",
                "- 图片应覆盖所有需要识别的字符",
                "- 标注文件必须与图片同名(仅扩展名不同)",
                "- 建议使用PNG格式保存预处理后的图片",
                ""
            };

            string readme = string.Join(Environment.NewLine, readmeLines);
            File.WriteAllText(Path.Combine(projectFolder, "README.md"), readme);

            // Training script.
            GenerateTrainingScript("myfont", Path.Combine(projectFolder, "train.bat"));

            Console.WriteLine($"示例项目已创建:{projectFolder}");
            Console.WriteLine("请按照 README.md 中的说明进行操作");
        }
    }
}