| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405 |
- using System;
- using System.Collections.Generic;
- using System.Diagnostics;
- using System.Drawing;
- using System.Drawing.Imaging;
- using System.IO;
- using System.Linq;
- using OpenCvSharp;
- namespace OCRTest
- {
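/// <summary>
/// Helper for preparing OCR training data:
/// preprocesses training images, writes and validates label files, and generates training scripts.
/// </summary>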
- public class OCRTrainingHelper
- {
/// <summary>
/// Batch-preprocesses every supported image found directly in <paramref name="inputFolder"/>
/// and writes each result as a PNG file into <paramref name="outputFolder"/>.
/// </summary>
/// <remarks>
/// Supported extensions (matched case-insensitively): .png, .jpg, .jpeg, .bmp, .tif, .tiff.
/// A failure on a single image is logged to the console and does not abort the batch.
/// </remarks>
/// <param name="inputFolder">Folder containing the source images.</param>
/// <param name="outputFolder">Folder that receives the processed PNG files; created if missing.</param>
/// <param name="targetWidth">Target width in pixels.</param>
/// <param name="targetHeight">Target height in pixels; 0 means "derive from the aspect ratio".</param>
/// <exception cref="DirectoryNotFoundException"><paramref name="inputFolder"/> does not exist.</exception>
public static void PreprocessTrainingImages(string inputFolder, string outputFolder,
    int targetWidth = 800, int targetHeight = 0)
{
    if (!Directory.Exists(inputFolder))
    {
        throw new DirectoryNotFoundException($"输入文件夹不存在:{inputFolder}");
    }

    // CreateDirectory is a no-op when the folder already exists.
    Directory.CreateDirectory(outputFolder);

    // .NET search patterns only understand '*' and '?'; brace lists such as "*.{png,jpg}"
    // are taken literally and match nothing, so filter on the extension instead.
    var imageExtensions = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
    {
        ".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"
    };
    var imageFiles = Directory.GetFiles(inputFolder)
        .Where(path => imageExtensions.Contains(Path.GetExtension(path)))
        .OrderBy(path => path, StringComparer.OrdinalIgnoreCase)
        .ToArray();

    Console.WriteLine($"找到 {imageFiles.Length} 张图片");

    int successCount = 0;
    int failCount = 0;

    // Tracks output names already produced in this run so that e.g. "a.jpg" and "a.bmp"
    // do not silently overwrite each other as "a.png".
    var usedOutputNames = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    foreach (var inputFile in imageFiles)
    {
        try
        {
            string fileName = Path.GetFileNameWithoutExtension(inputFile);

            string outputName = fileName;
            if (!usedOutputNames.Add(outputName))
            {
                // Name clash: append the source extension to keep both results.
                outputName = fileName + "_" + Path.GetExtension(inputFile).TrimStart('.');
                usedOutputNames.Add(outputName);
            }

            string outputFile = Path.Combine(outputFolder, $"{outputName}.png");
            PreprocessSingleImage(inputFile, outputFile, targetWidth, targetHeight);

            successCount++;
            Console.WriteLine($"[{successCount + failCount}/{imageFiles.Length}] 处理成功:{fileName}");
        }
        catch (Exception ex)
        {
            // Deliberate best-effort: record the failure and continue with the next image.
            failCount++;
            Console.WriteLine($"处理失败:{Path.GetFileName(inputFile)} - {ex.Message}");
        }
    }

    Console.WriteLine($"\n处理完成!成功:{successCount},失败:{failCount}");
}
/// <summary>
/// Preprocesses a single image and writes the result to <paramref name="outputFile"/>.
/// Steps: optional resize, grayscale conversion, 3x3 Gaussian blur, adaptive
/// (Gaussian-weighted, block size 11, constant 2) binary threshold, save.
/// </summary>
/// <param name="inputFile">Path of the image to read.</param>
/// <param name="outputFile">Path of the file to write (the only visible caller passes a .png path).</param>
/// <param name="targetWidth">Target width in pixels; values &lt;= 0 mean "derive from the height".</param>
/// <param name="targetHeight">Target height in pixels; values &lt;= 0 mean "derive from the width".</param>
/// <exception cref="IOException">The image could not be read or the result could not be written.</exception>
private static void PreprocessSingleImage(string inputFile, string outputFile,
    int targetWidth, int targetHeight)
{
    using (var src = Cv2.ImRead(inputFile, ImreadModes.Color))
    {
        if (src.Empty())
        {
            throw new IOException("无法读取图片");
        }

        // Mat wraps native memory; dispose the working copy deterministically
        // instead of leaking one buffer per processed image.
        using (Mat processed = src.Clone())
        {
            // 1. Resize. When both dimensions are given the image is fitted inside
            //    the box while preserving its aspect ratio.
            bool hasWidth = targetWidth > 0;
            bool hasHeight = targetHeight > 0;
            if (hasWidth || hasHeight)
            {
                int newWidth;
                int newHeight;
                if (hasWidth && hasHeight)
                {
                    double scale = Math.Min((double)targetWidth / src.Width, (double)targetHeight / src.Height);
                    newWidth = (int)(src.Width * scale);
                    newHeight = (int)(src.Height * scale);
                }
                else if (hasWidth)
                {
                    double scale = (double)targetWidth / src.Width;
                    newWidth = targetWidth;
                    newHeight = (int)(src.Height * scale);
                }
                else
                {
                    double scale = (double)targetHeight / src.Height;
                    newWidth = (int)(src.Width * scale);
                    newHeight = targetHeight;
                }

                // Extreme aspect ratios can truncate a dimension to 0, which Resize rejects.
                newWidth = Math.Max(1, newWidth);
                newHeight = Math.Max(1, newHeight);

                Cv2.Resize(processed, processed, new OpenCvSharp.Size(newWidth, newHeight));
            }

            // 2. Convert to grayscale.
            if (processed.Channels() > 1)
            {
                Cv2.CvtColor(processed, processed, ColorConversionCodes.BGR2GRAY);
            }

            // 3. Denoise with a small Gaussian blur.
            Cv2.GaussianBlur(processed, processed, new OpenCvSharp.Size(3, 3), 0);

            // 4. Adaptive binarization.
            Cv2.AdaptiveThreshold(processed, processed, 255,
                AdaptiveThresholdTypes.GaussianC, ThresholdTypes.Binary, 11, 2);

            // 5. Save. ImWrite reports failure through its return value, so check it;
            //    otherwise a failed save would be counted as a success by the caller.
            if (!Cv2.ImWrite(outputFile, processed))
            {
                throw new IOException($"无法保存图片:{outputFile}");
            }
        }
    }
}
/// <summary>
/// Writes one .txt label file next to each image referenced by <paramref name="labels"/>.
/// </summary>
/// <remarks>
/// For every key the method looks for <c>key + extension</c> in <paramref name="imageFolder"/>
/// (.png, .jpg, .jpeg, .bmp, .tif, .tiff, in that order). If none exists and the key itself
/// already ends in one of those extensions, the key is tried as a file name as-is.
/// Keys with no matching image produce a console warning and are skipped.
/// Label files are written as UTF-8 without a byte-order mark.
/// </remarks>
/// <param name="imageFolder">Folder containing the images.</param>
/// <param name="labels">Map of image name to the text shown in that image.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="imageFolder"/> or <paramref name="labels"/> is null.
/// </exception>
public static void GenerateLabelFiles(string imageFolder, Dictionary<string, string> labels)
{
    if (imageFolder == null)
    {
        throw new ArgumentNullException(nameof(imageFolder));
    }
    if (labels == null)
    {
        throw new ArgumentNullException(nameof(labels));
    }

    // Hoisted out of the loop: the candidate list never changes.
    string[] extensions = { ".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff" };

    // No BOM: a leading U+FEFF would otherwise become part of the label text for any
    // consumer that does not strip it.
    var utf8NoBom = new System.Text.UTF8Encoding(false);

    foreach (var kvp in labels)
    {
        string imageName = kvp.Key;
        string text = kvp.Value;

        string imagePath = null;
        foreach (var ext in extensions)
        {
            string candidate = Path.Combine(imageFolder, imageName + ext);
            if (File.Exists(candidate))
            {
                imagePath = candidate;
                break;
            }
        }

        // Fallback: the key already carries an image extension (e.g. "page1.jpg").
        if (imagePath == null
            && extensions.Contains(Path.GetExtension(imageName), StringComparer.OrdinalIgnoreCase))
        {
            string direct = Path.Combine(imageFolder, imageName);
            if (File.Exists(direct))
            {
                imagePath = direct;
            }
        }

        if (imagePath == null)
        {
            Console.WriteLine($"警告:找不到图片文件 {imageName}");
            continue;
        }

        // The label file sits next to the image and differs only in its extension.
        string txtPath = Path.ChangeExtension(imagePath, ".txt");
        File.WriteAllText(txtPath, text, utf8NoBom);
        Console.WriteLine($"生成标注文件:{Path.GetFileName(txtPath)}");
    }
}
/// <summary>
/// Checks that every image directly inside <paramref name="folder"/> has a label file
/// with the same name and a .txt extension next to it, and prints a report to the console.
/// </summary>
/// <remarks>
/// Image extensions considered (case-insensitive): .png, .jpg, .jpeg, .bmp, .tif, .tiff.
/// </remarks>
/// <param name="folder">Dataset folder.</param>
/// <returns>File names (without directory) of the images that have no label file.</returns>
public static List<string> ValidateDataset(string folder)
{
    // .NET search patterns do not support brace lists such as "*.{png,jpg}",
    // so list everything and filter on the extension instead.
    var imageExtensions = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
    {
        ".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"
    };
    var imageFiles = Directory.GetFiles(folder)
        .Where(path => imageExtensions.Contains(Path.GetExtension(path)))
        .OrderBy(path => path, StringComparer.OrdinalIgnoreCase)
        .ToArray();

    var missingLabels = new List<string>();
    foreach (var imageFile in imageFiles)
    {
        string txtFile = Path.ChangeExtension(imageFile, ".txt");
        if (!File.Exists(txtFile))
        {
            missingLabels.Add(Path.GetFileName(imageFile));
        }
    }

    if (missingLabels.Count > 0)
    {
        Console.WriteLine($"发现 {missingLabels.Count} 个缺少标注文件的图片:");
        foreach (var file in missingLabels)
        {
            Console.WriteLine($"  - {file}");
        }
    }
    else
    {
        Console.WriteLine($"数据集完整!共 {imageFiles.Length} 个样本");
    }

    return missingLabels;
}
/// <summary>
/// Counts how often each UTF-16 character occurs across all *.txt files directly inside
/// <paramref name="labelFolder"/> and prints the 50 most frequent ones to the console.
/// </summary>
/// <remarks>
/// Every <see cref="char"/> of the file content is counted, including whitespace and
/// line-break characters. Files are read as UTF-8.
/// </remarks>
/// <param name="labelFolder">Folder containing the label files.</param>
/// <returns>
/// Character-to-count map, filled in descending count order (ties ordered by character).
/// </returns>
public static Dictionary<char, int> AnalyzeCharacterFrequency(string labelFolder)
{
    var charFrequency = new Dictionary<char, int>();

    foreach (var txtFile in Directory.GetFiles(labelFolder, "*.txt"))
    {
        string content = File.ReadAllText(txtFile, System.Text.Encoding.UTF8);
        foreach (char c in content)
        {
            // Single lookup: a missing key leaves 'seen' at 0.
            int seen;
            charFrequency.TryGetValue(c, out seen);
            charFrequency[c] = seen + 1;
        }
    }

    // Rank in a list: Dictionary enumeration order is not guaranteed, so the report
    // must not depend on it. Ties are broken by character for a deterministic result.
    var ranked = charFrequency
        .OrderByDescending(kvp => kvp.Value)
        .ThenBy(kvp => kvp.Key)
        .ToList();

    Console.WriteLine("字符频率统计(前50个):");
    foreach (var kvp in ranked.Take(50))
    {
        // Show whitespace/control characters symbolically so they do not break the log layout.
        string displayChar;
        switch (kvp.Key)
        {
            case ' ':
                displayChar = "(空格)";
                break;
            case '\n':
                displayChar = "\\n";
                break;
            case '\r':
                displayChar = "\\r";
                break;
            case '\t':
                displayChar = "\\t";
                break;
            default:
                displayChar = kvp.Key.ToString();
                break;
        }
        Console.WriteLine($"  '{displayChar}': {kvp.Value} 次");
    }
    Console.WriteLine($"\n总字符种类:{ranked.Count}");

    // Insert in ranked order so callers that enumerate the result see the most
    // frequent characters first, as before.
    var sorted = new Dictionary<char, int>(ranked.Count);
    foreach (var kvp in ranked)
    {
        sorted.Add(kvp.Key, kvp.Value);
    }
    return sorted;
}
/// <summary>
/// Generates a Windows batch file that runs the Tesseract training tool chain
/// (tesseract, unicharset_extractor, shapeclustering, mftraining, cntraining,
/// combine_tessdata) for the given font/language name.
/// </summary>
/// <remarks>
/// The file is written as UTF-8 without a byte-order mark and with CRLF line endings, and
/// the script switches the console to code page 65001 so its non-ASCII messages display.
/// </remarks>
/// <param name="fontName">
/// Name used for <c>%LANG%</c> and for the generated file names. It is inserted unquoted into
/// commands, so it must not be empty or contain whitespace, shell metacharacters or
/// characters that are invalid in file names.
/// </param>
/// <param name="outputPath">Path of the batch file to write.</param>
/// <exception cref="ArgumentException"><paramref name="fontName"/> is empty or contains unsafe characters.</exception>
public static void GenerateTrainingScript(string fontName, string outputPath)
{
    if (string.IsNullOrWhiteSpace(fontName))
    {
        throw new ArgumentException("字体名称不能为空", nameof(fontName));
    }

    // fontName ends up unquoted in file names and command lines of the script.
    char[] unsafeChars = Path.GetInvalidFileNameChars()
        .Concat(" \t\"&|<>^%")
        .Distinct()
        .ToArray();
    if (fontName.IndexOfAny(unsafeChars) >= 0)
    {
        throw new ArgumentException($"字体名称包含非法字符:{fontName}", nameof(fontName));
    }

    // NOTE(review): shapeclustering/mftraining are given "unicharset" for -F; the Tesseract 3.x
    // training instructions pass a font_properties file there — confirm against the tool version in use.
    string script = $@"@echo off
chcp 65001 >nul
echo ========================================
echo Tesseract OCR 训练脚本
echo 字体名称:{fontName}
echo ========================================
echo.
set LANG={fontName}
set TESSDATA_PREFIX=%~dp0tessdata
echo [1/8] 生成box文件...
tesseract %LANG%.exp0.tif %LANG%.exp0 batch.nochop makebox
if errorlevel 1 goto error
echo [2/8] 生成训练文件...
tesseract %LANG%.exp0.tif %LANG%.exp0 nobatch box.train
if errorlevel 1 goto error
echo [3/8] 提取字符集...
unicharset_extractor %LANG%.exp0.box
if errorlevel 1 goto error
echo [4/8] 形状聚类...
shapeclustering -F unicharset -O unicharset %LANG%.exp0.tr
if errorlevel 1 goto error
echo [5/8] MF训练...
mftraining -F unicharset -U unicharset -O {fontName}.unicharset %LANG%.exp0.tr
if errorlevel 1 goto error
echo [6/8] CN训练...
cntraining %LANG%.exp0.tr
if errorlevel 1 goto error
echo [7/8] 重命名文件...
rename normproto {fontName}.normproto
rename inttemp {fontName}.inttemp
rename pffmtable {fontName}.pffmtable
rename shapetable {fontName}.shapetable
echo [8/8] 合并训练数据...
combine_tessdata {fontName}.
if errorlevel 1 goto error
echo.
echo ========================================
echo 训练完成!
echo 生成的文件:{fontName}.traineddata
echo 请将其复制到 tessdata 文件夹
echo ========================================
pause
exit /b 0
:error
echo.
echo 训练失败!错误代码:%errorlevel%
pause
exit /b 1
";

    // The literal inherits whatever line endings this source file was saved with;
    // batch labels/goto are only reliable with CRLF, so normalise.
    script = script.Replace("\r\n", "\n").Replace("\n", "\r\n");

    // No BOM: cmd.exe would treat the BOM bytes as part of the first command ("@echo off").
    File.WriteAllText(outputPath, script, new System.Text.UTF8Encoding(false));
    Console.WriteLine($"训练脚本已生成:{outputPath}");
}
/// <summary>
/// Creates a sample training project: the folder layout (raw_images, processed_images,
/// labels), a README.md describing the workflow, and a train.bat generated for the
/// font name "myfont".
/// </summary>
/// <param name="projectFolder">Root folder of the project; created if it does not exist.</param>
public static void CreateSampleProject(string projectFolder)
{
    // CreateDirectory is a no-op for folders that already exist.
    Directory.CreateDirectory(projectFolder);

    // Sub-folders.
    string rawFolder = Path.Combine(projectFolder, "raw_images");
    string processedFolder = Path.Combine(projectFolder, "processed_images");
    string labelsFolder = Path.Combine(projectFolder, "labels");
    Directory.CreateDirectory(rawFolder);
    Directory.CreateDirectory(processedFolder);
    Directory.CreateDirectory(labelsFolder);

    string trainScriptPath = Path.Combine(projectFolder, "train.bat");

    // README. The C# snippets use verbatim string literals (@"...") so that Windows
    // paths containing backslashes stay valid when copied into code.
    string readme = $@"# OCR训练项目
## 文件夹说明
- **raw_images/**: 原始训练图片
- **processed_images/**: 预处理后的图片
- **labels/**: 标注文件(.txt格式)
## 使用步骤
### 1. 准备训练数据
将训练图片放入 `raw_images` 文件夹
### 2. 预处理图片
```csharp
OCRTrainingHelper.PreprocessTrainingImages(
@""{rawFolder}"",
@""{processedFolder}"",
targetWidth: 800
);
```
### 3. 创建标注文件
为每张图片创建对应的 .txt 文件,内容为识别文本
### 4. 验证数据集
```csharp
OCRTrainingHelper.ValidateDataset(@""{processedFolder}"");
```
### 5. 分析字符频率
```csharp
OCRTrainingHelper.AnalyzeCharacterFrequency(@""{labelsFolder}"");
```
### 6. 生成训练脚本
```csharp
OCRTrainingHelper.GenerateTrainingScript(
""myfont"",
@""{trainScriptPath}""
);
```
### 7. 执行训练
运行 `train.bat` 开始训练
## 注意事项
- 每个样本至少需要100-500张图片
- 图片应覆盖所有需要识别的字符
- 标注文件必须与图片同名(仅扩展名不同)
- 建议使用PNG格式保存预处理后的图片
";
    File.WriteAllText(Path.Combine(projectFolder, "README.md"), readme);

    // Training script for the sample font name.
    GenerateTrainingScript("myfont", trainScriptPath);

    Console.WriteLine($"示例项目已创建:{projectFolder}");
    Console.WriteLine("请按照 README.md 中的说明进行操作");
}
- }
- }
|