网站办公室,科技术语,做网站周记,网站建设百度经验需求：识别pdf文件中的中文
根据github项目mymonstercat 改造,先将pdf文件转为png文件存于临时文件夹，然后通过RapidOcr转为文字,最后删除临时文件夹
1、引入依赖 <dependency><groupId>org.apache.pdfbox</groupId><artifactId>…
识别pdf文件中的中文
根据github项目mymonstercat 改造,先将pdf文件转为png文件存于临时文件夹，然后通过RapidOcr转为文字,最后删除临时文件夹
1、引入依赖
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>fontbox</artifactId>
    <version>3.0.3</version>
</dependency>
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>3.0.3</version>
</dependency>
<!-- ocr图片识别 -->
<dependency>
    <groupId>io.github.mymonstercat</groupId>
    <artifactId>rapidocr</artifactId>
    <version>0.0.7</version>
</dependency>
<dependency>
    <groupId>io.github.mymonstercat</groupId>
    <artifactId>rapidocr-onnx-platform</artifactId>
    <version>0.0.7</version>
</dependency>
<!-- 本地测试可不引 , 服务器部署linux x86架构 下引入 ,其他环境部署可搜maven -->
<dependency>
    <groupId>io.github.mymonstercat</groupId>
    <artifactId>rapidocr-onnx-linux-x86_64</artifactId>
    <version>1.2.2</version>
</dependency>
2、工具类
import org.springframework.util.StringUtils;
import com.benjaminwan.ocrlibrary.OcrResult;
import com.benjaminwan.ocrlibrary.TextBlock;import io.github.mymonstercat.Model;
import io.github.mymonstercat.ocr.InferenceEngine;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.springframework.stereotype.Service;import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.Base64;
import java.util.List;
import java.util.UUID;
Service
public class PdfOCRConverter {//临时输出png文件路径private static final String outputDirs D:/pdfToImg/temp/;public static void main(String[] args) throws IOException {ListString fileNameList getWords(D:/Download/123.pdf);for (String fileName : fileNameList) {System.out.println(fileName);}}public static ListString getWords(String pdfFilePath) throws IOException {String outputDir outputDirs UUID.randomUUID().toString().replace(-, );ListString fileNameList convertPdfToImage(pdfFilePath, outputDir);ListString wordsList new ArrayList();for (String fileName : fileNameList) {System.out.println(识别图片fileName);if (StringUtils.isEmpty(fileName)){break;}ListString words runOcr(fileName);for (String word : words) {System.out.println(word);wordsList.add(word);}}deleteDirectory(outputDir);return wordsList;}public static ListString runOcr(String path) {ListString results new ArrayList();InferenceEngine engine InferenceEngine.getInstance(Model.ONNX_PPOCR_V3);OcrResult ocrResult engine.runOcr(path);for (TextBlock textBlock : ocrResult.getTextBlocks()) {results.add(textBlock.getText());}return results;}public static ListString convertPdfToImage(String pdfFilePath, String outputDir) {// 设置DPI越高图片越清晰但文件也会更大int dpi 300;ListString fileNameList new ArrayList();File file new File(pdfFilePath);try (PDDocument document Loader.loadPDF(file)) {PDFRenderer pdfRenderer new PDFRenderer(document);String pdfFileName file.getName().replace(.pdf, );String name pdfFileName;for (int page 0; page document.getNumberOfPages(); page) {BufferedImage bim pdfRenderer.renderImageWithDPI(page, dpi);String folder createFolder(outputDir / name);String fileName folder / pdfFileName _page_ (page 1) .png;ImageIO.write(bim, png, new File(fileName));fileNameList.add(fileName);System.out.println(生成图片fileName);}} catch (IOException e) {e.printStackTrace();}return fileNameList;}public static void deleteDirectory(String path) throws IOException {// 如果路径不指向一个目录则抛出异常Path directory Paths.get(path);if (!Files.isDirectory(directory)) 
{throw new IOException(The provided path is not a directory.);}// 遍历目录中的所有文件和子目录Files.walkFileTree(directory, new SimpleFileVisitorPath() {Overridepublic FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {// 删除文件Files.delete(file);return FileVisitResult.CONTINUE;}Overridepublic FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {// 所有内容被删除后删除目录本身Files.delete(dir);return FileVisitResult.CONTINUE;}Overridepublic FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException {// 如果访问文件失败则抛出异常throw exc;}});}public static String createFolder(String folderPath) {String txt folderPath;try {File myFilePath new File(txt);txt folderPath;if (!myFilePath.exists()) {myFilePath.mkdirs();}} catch (Exception e) {e.printStackTrace();}return txt;}public static ListString getWordsByBase64(String base64) throws IOException {ListString words new ArrayList();if (StringUtils.isEmpty(base64)) {return null;}String outputDir outputDirs UUID.randomUUID().toString().replace(-, );// 解码Base64字符串byte[] decodedBytes Base64.getDecoder().decode(base64);createFolder(outputDir);// 输出的PDF文件名String outputFilePath outputDir/output.pdf;try (FileOutputStream fos new FileOutputStream(outputFilePath)) {// 将解码后的字节数组写入文件fos.write(decodedBytes);System.out.println(PDF文件已成功生成: outputFilePath);words getWords(outputFilePath);} catch (Exception e) {e.printStackTrace();}deleteDirectory(outputDir);return words;}}