using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Drawing;
using System.Drawing.Imaging;
using System.IO;
using System.Linq;
using OpenCvSharp;
namespace OCRTest
{
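/// <summary>
/// Helper for preparing OCR training data.
/// Used to collect, preprocess and validate training images and their label files.
/// </summary>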
public class OCRTrainingHelper
{
/// <summary>
/// Batch-preprocesses the training images found directly inside <paramref name="inputFolder"/>
/// and writes each result as a PNG file into <paramref name="outputFolder"/>.
/// </summary>
/// <remarks>
/// A failure on a single image is logged to the console and counted; it does not abort the batch.
/// Output files are named after the input file without its extension, so two inputs that differ
/// only by extension (for example "a.jpg" and "a.png") map to the same output file.
/// </remarks>
/// <param name="inputFolder">Folder containing the source images.</param>
/// <param name="outputFolder">Folder that receives the processed PNG files; created when missing.</param>
/// <param name="targetWidth">Target width in pixels.</param>
/// <param name="targetHeight">Target height in pixels; 0 means "derive it from the width".</param>
/// <exception cref="DirectoryNotFoundException">
/// Thrown when <paramref name="inputFolder"/> does not exist.
/// </exception>
public static void PreprocessTrainingImages(string inputFolder, string outputFolder,
    int targetWidth = 800, int targetHeight = 0)
{
    if (!Directory.Exists(inputFolder))
    {
        throw new DirectoryNotFoundException($"输入文件夹不存在:{inputFolder}");
    }

    // CreateDirectory does nothing when the folder already exists.
    Directory.CreateDirectory(outputFolder);

    // Directory.GetFiles only understands '*' and '?' wildcards; a brace list such as
    // "*.{png,jpg}" matches nothing. Filter on the extension instead.
    var imageExtensions = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
    {
        ".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"
    };
    string[] imageFiles = Directory.EnumerateFiles(inputFolder)
        .Where(path => imageExtensions.Contains(Path.GetExtension(path)))
        .OrderBy(path => path, StringComparer.OrdinalIgnoreCase)
        .ToArray();

    Console.WriteLine($"找到 {imageFiles.Length} 张图片");

    int successCount = 0;
    int failCount = 0;
    foreach (var inputFile in imageFiles)
    {
        try
        {
            string fileName = Path.GetFileNameWithoutExtension(inputFile);
            string outputFile = Path.Combine(outputFolder, $"{fileName}.png");
            PreprocessSingleImage(inputFile, outputFile, targetWidth, targetHeight);
            successCount++;
            Console.WriteLine($"[{successCount + failCount}/{imageFiles.Length}] 处理成功:{fileName}");
        }
        catch (Exception ex)
        {
            // Deliberately best-effort: report the failing file and keep going.
            failCount++;
            Console.WriteLine($"处理失败:{Path.GetFileName(inputFile)} - {ex.Message}");
        }
    }

    Console.WriteLine($"\n处理完成!成功:{successCount},失败:{failCount}");
}
/// <summary>
/// Preprocesses one image: optional aspect-preserving resize, grayscale conversion,
/// 3x3 Gaussian blur and adaptive Gaussian thresholding, then writes the result to
/// <paramref name="outputFile"/>.
/// </summary>
/// <param name="inputFile">Path of the image to read.</param>
/// <param name="outputFile">Path of the image to write; the extension selects the format.</param>
/// <param name="targetWidth">Target width in pixels; a value &lt;= 0 means "derive from the height".</param>
/// <param name="targetHeight">Target height in pixels; a value &lt;= 0 means "derive from the width".</param>
/// <exception cref="IOException">Thrown when the image cannot be read or written.</exception>
private static void PreprocessSingleImage(string inputFile, string outputFile,
    int targetWidth, int targetHeight)
{
    using (var src = Cv2.ImRead(inputFile, ImreadModes.Color))
    {
        if (src.Empty())
        {
            throw new IOException("无法读取图片");
        }

        // The working copy wraps native memory, so it must be disposed as well.
        using (var processed = src.Clone())
        {
            // 1. Resize, keeping the aspect ratio.
            bool hasWidth = targetWidth > 0;
            bool hasHeight = targetHeight > 0;
            if (hasWidth || hasHeight)
            {
                double scale;
                if (hasWidth && hasHeight)
                {
                    // Fit inside the requested box.
                    scale = Math.Min((double)targetWidth / src.Width, (double)targetHeight / src.Height);
                }
                else if (hasWidth)
                {
                    scale = (double)targetWidth / src.Width;
                }
                else
                {
                    scale = (double)targetHeight / src.Height;
                }

                // A dimension that was the only one supplied is used as-is; derived dimensions
                // are clamped to 1 because truncation can otherwise produce an invalid 0-pixel size.
                int newWidth = hasHeight ? Math.Max(1, (int)(src.Width * scale)) : targetWidth;
                int newHeight = hasWidth ? Math.Max(1, (int)(src.Height * scale)) : targetHeight;
                Cv2.Resize(processed, processed, new OpenCvSharp.Size(newWidth, newHeight));
            }

            // 2. Convert to grayscale.
            if (processed.Channels() > 1)
            {
                Cv2.CvtColor(processed, processed, ColorConversionCodes.BGR2GRAY);
            }

            // 3. Denoise with a small Gaussian blur.
            Cv2.GaussianBlur(processed, processed, new OpenCvSharp.Size(3, 3), 0);

            // 4. Adaptive binarization (block size 11, constant 2).
            Cv2.AdaptiveThreshold(processed, processed, 255,
                AdaptiveThresholdTypes.GaussianC, ThresholdTypes.Binary, 11, 2);

            // 5. Save. ImWrite reports failure through its return value, so check it
            //    instead of silently treating a failed write as success.
            if (!Cv2.ImWrite(outputFile, processed))
            {
                throw new IOException($"无法写入图片:{outputFile}");
            }
        }
    }
}
/// <summary>
/// Writes one label (.txt) file next to each image named in <paramref name="labels"/>.
/// </summary>
/// <remarks>
/// For every entry the image is looked up as "key + extension" for the extensions
/// .png, .jpg, .jpeg, .bmp, .tif and .tiff (in that order); the first existing file wins.
/// Entries without a matching image are reported on the console and skipped.
/// </remarks>
/// <param name="imageFolder">Folder containing the images.</param>
/// <param name="labels">Map from image file name (without extension) to its text content.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="labels"/> is null.</exception>
public static void GenerateLabelFiles(string imageFolder, Dictionary<string, string> labels)
{
    if (labels == null)
    {
        throw new ArgumentNullException(nameof(labels));
    }

    // Supported image formats, probed in this order.
    string[] extensions = { ".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff" };

    // Encoding.UTF8 would prepend a byte-order mark, which downstream tools may read as
    // part of the label text; write plain UTF-8 instead.
    var utf8NoBom = new System.Text.UTF8Encoding(false);

    foreach (var kvp in labels)
    {
        string imageName = kvp.Key;
        string imagePath = extensions
            .Select(ext => Path.Combine(imageFolder, imageName + ext))
            .FirstOrDefault(candidate => File.Exists(candidate));

        if (imagePath == null)
        {
            Console.WriteLine($"警告:找不到图片文件 {imageName}");
            continue;
        }

        // The label file shares the image's name, with a .txt extension.
        string txtPath = Path.ChangeExtension(imagePath, ".txt");
        File.WriteAllText(txtPath, kvp.Value, utf8NoBom);
        Console.WriteLine($"生成标注文件:{Path.GetFileName(txtPath)}");
    }
}
/// <summary>
/// Checks that every image directly inside <paramref name="folder"/> has a label file
/// with the same name and a .txt extension, and prints a report to the console.
/// </summary>
/// <param name="folder">Dataset folder containing images and label files.</param>
/// <returns>File names of the images that have no label file; empty when none are missing.</returns>
public static List<string> ValidateDataset(string folder)
{
    // Directory.GetFiles only understands '*' and '?' wildcards; a brace list such as
    // "*.{png,jpg}" matches nothing. Filter on the extension instead.
    var imageExtensions = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
    {
        ".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"
    };
    string[] imageFiles = Directory.EnumerateFiles(folder)
        .Where(path => imageExtensions.Contains(Path.GetExtension(path)))
        .OrderBy(path => path, StringComparer.OrdinalIgnoreCase)
        .ToArray();

    List<string> missingLabels = imageFiles
        .Where(imageFile => !File.Exists(Path.ChangeExtension(imageFile, ".txt")))
        .Select(imageFile => Path.GetFileName(imageFile))
        .ToList();

    if (missingLabels.Count > 0)
    {
        Console.WriteLine($"发现 {missingLabels.Count} 个缺少标注文件的图片:");
        foreach (var file in missingLabels)
        {
            Console.WriteLine($" - {file}");
        }
    }
    else
    {
        Console.WriteLine($"数据集完整!共 {imageFiles.Length} 个样本");
    }

    return missingLabels;
}
/// <summary>
/// Counts how often each character occurs across all .txt files directly inside
/// <paramref name="labelFolder"/> and prints the 50 most frequent ones to the console.
/// </summary>
/// <remarks>
/// Counting is per UTF-16 <see cref="char"/>, so line breaks are counted too and a character
/// outside the Basic Multilingual Plane is counted as its two surrogate halves.
/// </remarks>
/// <param name="labelFolder">Folder containing the label files.</param>
/// <returns>
/// Character-to-count map, filled in descending count order. <see cref="Dictionary{TKey,TValue}"/>
/// does not formally guarantee enumeration order, so callers that need a ranking should sort again.
/// </returns>
public static Dictionary<char, int> AnalyzeCharacterFrequency(string labelFolder)
{
    var charFrequency = new Dictionary<char, int>();
    foreach (var txtFile in Directory.GetFiles(labelFolder, "*.txt"))
    {
        string content = File.ReadAllText(txtFile, System.Text.Encoding.UTF8);
        foreach (char c in content)
        {
            // Single lookup; 'current' is 0 when the character has not been seen yet.
            int current;
            charFrequency.TryGetValue(c, out current);
            charFrequency[c] = current + 1;
        }
    }

    // Sort by frequency; ties are broken by character so the report is deterministic.
    var ordered = charFrequency
        .OrderByDescending(kvp => kvp.Value)
        .ThenBy(kvp => kvp.Key)
        .ToList();

    Console.WriteLine("字符频率统计(前50个):");
    foreach (var kvp in ordered.Take(50))
    {
        // Show whitespace and control characters in a visible form so the report stays one line per entry.
        string displayChar;
        switch (kvp.Key)
        {
            case ' ':
                displayChar = "(空格)";
                break;
            case '\n':
                displayChar = "\\n";
                break;
            case '\r':
                displayChar = "\\r";
                break;
            case '\t':
                displayChar = "\\t";
                break;
            default:
                displayChar = kvp.Key.ToString();
                break;
        }
        Console.WriteLine($" '{displayChar}': {kvp.Value} 次");
    }

    Console.WriteLine($"\n总字符种类:{ordered.Count}");
    return ordered.ToDictionary(kvp => kvp.Key, kvp => kvp.Value);
}
/// <summary>
/// Generates a Windows batch file that runs the Tesseract training tool chain for one font.
/// </summary>
/// <remarks>
/// The file is written as UTF-8 without a byte-order mark and with CRLF line endings:
/// a BOM in front of "@echo off" makes cmd.exe reject the first line, and LF-only line
/// endings can break "goto" label resolution. The script switches the console to code
/// page 65001 so its UTF-8 messages are displayed correctly.
/// </remarks>
/// <param name="fontName">
/// Font name; it is inserted into file names and commands, so it must not be empty and must
/// not contain whitespace, characters that are invalid in file names, or batch metacharacters.
/// </param>
/// <param name="outputPath">Path of the batch file to write.</param>
/// <exception cref="ArgumentException">Thrown when <paramref name="fontName"/> is empty or unsafe.</exception>
public static void GenerateTrainingScript(string fontName, string outputPath)
{
    if (string.IsNullOrWhiteSpace(fontName))
    {
        throw new ArgumentException("字体名称不能为空", nameof(fontName));
    }

    // The name is spliced unquoted into commands; reject anything cmd.exe would interpret.
    char[] batchMetaChars = { '%', '&', '|', '<', '>', '^', '"', '!' };
    char[] invalidFileNameChars = Path.GetInvalidFileNameChars();
    bool unsafeName = fontName.Any(ch =>
        char.IsWhiteSpace(ch) || batchMetaChars.Contains(ch) || invalidFileNameChars.Contains(ch));
    if (unsafeName)
    {
        throw new ArgumentException($"字体名称包含非法字符:{fontName}", nameof(fontName));
    }

    string script = $@"@echo off
chcp 65001 >nul
echo ========================================
echo Tesseract OCR 训练脚本
echo 字体名称:{fontName}
echo ========================================
echo.
set LANG={fontName}
set TESSDATA_PREFIX=%~dp0tessdata
echo [1/8] 生成box文件...
tesseract %LANG%.exp0.tif %LANG%.exp0 batch.nochop makebox
if errorlevel 1 goto error
echo [2/8] 生成训练文件...
tesseract %LANG%.exp0.tif %LANG%.exp0 nobatch box.train
if errorlevel 1 goto error
echo [3/8] 提取字符集...
unicharset_extractor %LANG%.exp0.box
if errorlevel 1 goto error
echo [4/8] 形状聚类...
shapeclustering -F unicharset -O unicharset %LANG%.exp0.tr
if errorlevel 1 goto error
echo [5/8] MF训练...
mftraining -F unicharset -U unicharset -O {fontName}.unicharset %LANG%.exp0.tr
if errorlevel 1 goto error
echo [6/8] CN训练...
cntraining %LANG%.exp0.tr
if errorlevel 1 goto error
echo [7/8] 重命名文件...
rename normproto {fontName}.normproto
rename inttemp {fontName}.inttemp
rename pffmtable {fontName}.pffmtable
rename shapetable {fontName}.shapetable
echo [8/8] 合并训练数据...
combine_tessdata {fontName}.
if errorlevel 1 goto error
echo.
echo ========================================
echo 训练完成!
echo 生成的文件:{fontName}.traineddata
echo 请将其复制到 tessdata 文件夹
echo ========================================
pause
exit /b 0
:error
echo.
echo 训练失败!错误代码:%errorlevel%
pause
exit /b 1
";

    // The literal's line endings depend on how this source file was saved; force CRLF.
    script = script.Replace("\r\n", "\n").Replace("\n", "\r\n");

    File.WriteAllText(outputPath, script, new System.Text.UTF8Encoding(false));
    Console.WriteLine($"训练脚本已生成:{outputPath}");
}
/// <summary>
/// Creates a sample training project: the folder layout (raw_images, processed_images,
/// labels), a README.md describing the workflow, and a train.bat training script.
/// </summary>
/// <param name="projectFolder">Root folder of the project; created when missing.</param>
public static void CreateSampleProject(string projectFolder)
{
    // CreateDirectory creates missing parents and does nothing for existing folders,
    // so no separate existence check is needed.
    Directory.CreateDirectory(projectFolder);

    string rawFolder = Path.Combine(projectFolder, "raw_images");
    string processedFolder = Path.Combine(projectFolder, "processed_images");
    string labelsFolder = Path.Combine(projectFolder, "labels");
    string scriptPath = Path.Combine(projectFolder, "train.bat");
    Directory.CreateDirectory(rawFolder);
    Directory.CreateDirectory(processedFolder);
    Directory.CreateDirectory(labelsFolder);

    // The C# samples in the README use verbatim string literals (@"..."): the embedded paths
    // contain backslashes on Windows, which a regular literal would treat as escape sequences.
    // NOTE(review): the README stores labels in labels/ while ValidateDataset looks for .txt
    // files next to the images — confirm which layout is intended.
    string readme = $@"# OCR训练项目
## 文件夹说明
- **raw_images/**: 原始训练图片
- **processed_images/**: 预处理后的图片
- **labels/**: 标注文件(.txt格式)
## 使用步骤
### 1. 准备训练数据
将训练图片放入 `raw_images` 文件夹
### 2. 预处理图片
```csharp
OCRTrainingHelper.PreprocessTrainingImages(
@""{rawFolder}"",
@""{processedFolder}"",
targetWidth: 800
);
```
### 3. 创建标注文件
为每张图片创建对应的 .txt 文件,内容为识别文本
### 4. 验证数据集
```csharp
OCRTrainingHelper.ValidateDataset(@""{processedFolder}"");
```
### 5. 分析字符频率
```csharp
OCRTrainingHelper.AnalyzeCharacterFrequency(@""{labelsFolder}"");
```
### 6. 生成训练脚本
```csharp
OCRTrainingHelper.GenerateTrainingScript(
""myfont"",
@""{scriptPath}""
);
```
### 7. 执行训练
运行 `train.bat` 开始训练
## 注意事项
- 每个样本至少需要100-500张图片
- 图片应覆盖所有需要识别的字符
- 标注文件必须与图片同名(仅扩展名不同)
- 建议使用PNG格式保存预处理后的图片
";
    File.WriteAllText(Path.Combine(projectFolder, "README.md"), readme);

    // Generate the training script alongside the README.
    GenerateTrainingScript("myfont", scriptPath);

    Console.WriteLine($"示例项目已创建:{projectFolder}");
    Console.WriteLine("请按照 README.md 中的说明进行操作");
}
}
}