public class DocumentExtractor extends AbstractExtractor implements Closeable
Copyright (c) 2020 xsx All Rights Reserved. x-easypdf-pdfbox is licensed under Mulan PSL v2. You can use this software according to the terms and conditions of the Mulan PSL v2. You may obtain a copy of Mulan PSL v2 at: http://license.coscl.org.cn/MulanPSL2 THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. See the Mulan PSL v2 for more details.
| 限定符和类型 | 字段和说明 |
|---|---|
| protected AbstractBookmarkExtractor | bookmarkExtractor<br>书签提取器 |
| protected AbstractCommentExtractor | commentExtractor<br>评论提取器 |
| protected AbstractFormExtractor | formExtractor<br>表单提取器 |
| protected AbstractImageExtractor | imageExtractor<br>图像提取器 |
| protected AbstractTextExtractor | textExtractor<br>文本提取器 |
log, document

| 构造器和说明 |
|---|
| DocumentExtractor(Document document)<br>构造方法 |
| 限定符和类型 | 方法和说明 |
|---|---|
| void | close()<br>关闭 |
| Map<Integer,org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem> | extractBookmark(int... bookmarkIndexes)<br>提取书签 |
| Map<Integer,List<String>> | extractComment(int... pageIndexes)<br>提取评论 |
| Map<Integer,List<String>> | extractCommentByRegex(String regex, int... pageIndexes)<br>正则提取评论 |
| Map<String,org.apache.pdfbox.pdmodel.interactive.form.PDField> | extractFormField()<br>表单提取字段 |
| Map<String,BufferedImage> | extractFormImage()<br>表单提取图像 |
| Map<String,String> | extractFormText()<br>表单提取文本 |
| Map<Integer,List<BufferedImage>> | extractImage(int... pageIndexes)<br>提取图像 |
| Map<Integer,List<String>> | extractText(int... pageIndexes)<br>提取文本 |
| Map<Integer,List<String>> | extractTextByRegex(String regex, int... pageIndexes)<br>正则提取文本 |
| Map<Integer,Map<String,String>> | extractTextByRegionArea(Map<String,Rectangle> regionArea, int... pageIndexes)<br>区域提取文本 |
| Map<Integer,Map<String,String>> | extractTextByRegionArea(Map<String,Rectangle> regionArea, String wordSeparator, int... pageIndexes)<br>区域提取文本 |
| Map<Integer,Map<String,List<List<String>>>> | extractTextForTable(Map<String,Rectangle> regionArea, int... pageIndexes)<br>表格提取文本<br>注:单行单列 |
| Map<Integer,Map<String,List<List<String>>>> | extractTextForTable(Map<String,Rectangle> regionArea, String wordSeparator, int... pageIndexes)<br>表格提取文本<br>注:单行单列 |
getDocument

protected AbstractTextExtractor textExtractor
protected AbstractImageExtractor imageExtractor
protected AbstractFormExtractor formExtractor
protected AbstractCommentExtractor commentExtractor
protected AbstractBookmarkExtractor bookmarkExtractor
public DocumentExtractor(Document document)
document - 文档
public Map<Integer,List<String>> extractText(int... pageIndexes)
pageIndexes - 页面索引
key=页面索引,value=提取文本
public Map<Integer,List<String>> extractTextByRegex(String regex, int... pageIndexes)
regex - 正则表达式
pageIndexes - 页面索引
key=页面索引,value=提取文本
public Map<Integer,Map<String,String>> extractTextByRegionArea(Map<String,Rectangle> regionArea, int... pageIndexes)
regionArea - 区域
pageIndexes - 页面索引
一级,key = 页面索引,value = 提取文本字典
二级,key = 区域名称,value = 提取文本
public Map<Integer,Map<String,String>> extractTextByRegionArea(Map<String,Rectangle> regionArea, String wordSeparator, int... pageIndexes)
regionArea - 区域
wordSeparator - 单词分隔符
pageIndexes - 页面索引
一级,key = 页面索引,value = 提取文本字典
二级,key = 区域名称,value = 提取文本
public Map<Integer,Map<String,List<List<String>>>> extractTextForTable(Map<String,Rectangle> regionArea, int... pageIndexes)
注:单行单列
regionArea - 区域
pageIndexes - 页面索引
一级,key = 页面索引,value = 提取文本字典
二级,key = 区域名称,value = 提取文本
public Map<Integer,Map<String,List<List<String>>>> extractTextForTable(Map<String,Rectangle> regionArea, String wordSeparator, int... pageIndexes)
注:单行单列
regionArea - 区域
wordSeparator - 单词分隔符
pageIndexes - 页面索引
一级,key = 页面索引,value = 提取文本字典
二级,key = 区域名称,value = 提取文本
public Map<Integer,List<BufferedImage>> extractImage(int... pageIndexes)
pageIndexes - 页面索引
key = 页面索引,value = 提取图像
public Map<String,String> extractFormText()
key = 字段名称,value = 提取文本
public Map<String,BufferedImage> extractFormImage()
key = 字段名称,value = 提取图像
public Map<String,org.apache.pdfbox.pdmodel.interactive.form.PDField> extractFormField()
key = 字段名称,value = 提取字段
public Map<Integer,List<String>> extractComment(int... pageIndexes)
pageIndexes - 页面索引
key=页面索引,value=提取评论
public Map<Integer,List<String>> extractCommentByRegex(String regex, int... pageIndexes)
regex - 正则表达式
pageIndexes - 页面索引
key=页面索引,value=提取评论
public Map<Integer,org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem> extractBookmark(int... bookmarkIndexes)
bookmarkIndexes - 书签索引
key=书签索引,value=提取书签
public void close()
close 在接口中 Closeable
close 在接口中 AutoCloseable

Copyright © 2024. All rights reserved.