pdf怎么提取坐标,pdf文件坐标

Java中常用的PDF处理库有PDFbox和itext。本节介绍如何使用PDFbox和itext提取PDF的字符坐标。

一、itext获取文字坐标

1、定义一个类实现RenderListener，可以用里面的几种方法操作PDF中的文字和图像

import java.awt.Color； import Java.awt.image.buffered image； import java.io.IOException； import java.util.ArrayList； import java.util.HashMap； import java.util.HashSet； import java.util.List； import java.util.Map； import javax.imageio.ImageIO； import com.itext pdf.awt.geom.rectangle 2d； import com.itext pdf.awt.geom.rectangular shape； import com.itext pdf.text.base color； import com.itext pdf.text.rectangle； import com.itext pdf.text.pdf.pdfcontentbyte； import com.itext pdf.text.pdf.parser.imagerenderinfo； import com.itext pdf.text.pdf.parser.render listener； import com.itext pdf.text.pdf.parser.textrenderinfo； publicclasstestrenderlistenerimplementsrenderlistener//要存储文本的矩形list rectangle 2d.floatrecttext=newarraylistrectangle 2d.floatrecttext=newarraylistrectangle 2d.flo ast//文本//用于存储字符的y坐标listfloatlisty=newarraylistfloat (； //用于存储每行文本的坐标位置ListMapString，rectangle 2d.float rows _ text _ rect=new ArrayList (； //PDF文件的路径protected String filepath=null； public TestRenderListener () {}//step 2， ' BT '执行@ overridepublicvoidbegintextblock ((/todo auto-generatedmethodstub )/step3/***字符的主要处理方法)/@ overrridepublidepublintextextextblock /获取文本下的矩形//rectangle 2d.floatrectbase=render info.get base string text=render info.gettext (； text.length (0) rectangularshaperectbase=render info.get baseline ).getBoundingRectange； //文本下的矩形rectangle 2d.floatrectascen=render info.getascentline ().getBoundingRectange ) )； //字符边框矩形float leftX=(float ) rectBase.getMinX (；浮体lefty=(float ) rectbase.getminy(-1； floatrightx=(float ) rectAscen.getMaxX )；浮动权限=(浮动) rectAscen.getMaxY ) ) 1； rectangle 2d.float rect=new rectangle 2d.float (leftx，leftY，rightX - leftX，rightY - leftY )； system.out.println (' text : ' text '-- x : ' rect.x '-- y : ' rect.y '-- width 3360 ' rect.width ' float tempx=rect.xrect text.get (index ).x？ recttext.get(index ).x : rect.x； rectText.s

et(index,new Rectangle2D.Float(tempx,rect.y,rect.width + rectText.get(index).width,rect.height));textList.set(index,textList.get(index) + text);}else{rectText.add(rect);textList.add(text);listY.add(rect.y);}Map<String,Rectangle2D.Float> map = new HashMap<>();map.put(text,rect);rows_text_rect.add(map);}}//step 4(最后执行的，只执行一次)，遇到“ET”执行@Overridepublic void endTextBlock() {// TODO Auto-generated method stub}//step 1(图片处理方法)@Overridepublic void renderImage(ImageRenderInfo renderInfo) {}}2、使用自定义的类来实现获取PDF的文字坐标

PdfReader reader = new PdfReader(pdfPath);//新建一个PDF解析对象PdfReaderContentParser parser = new PdfReaderContentParser(reader);//包含了PDF页面的信息，作为处理的对象PdfStamper stamper = new PdfStamper(reader, new FileOutputStream("d:/test.pdf"));for(int i = 1;i <= reader.getNumberOfPages();i++){//新建一个ImageRenderListener对象，该对象实现了RenderListener接口，作为处理PDF的主要类TestRenderListener listener = new TestRenderListener();//解析PDF，并处理里面的文字parser.processContent(i, listener);//获取文字的矩形边框List<Rectangle2D.Float> rectText = listener.rectText;List<String> textList = listener.textList;List<Float> listY = listener.listY;List<Map<String,Rectangle2D.Float>> list_text = listener.rows_text_rect;for(int k = 0;k < list_text.size();k++){Map<String,Rectangle2D.Float> map = list_text.get(k);for(Map.Entry<String, Rectangle2D.Float>entry:map.entrySet()){System.out.println(entry.getKey()+"---"+entry.getValue());}}}

二、PDFbox获取文字坐标

PDFbox与itext不同的是，PDFbox只能一个字一个字地提取PDF的文字坐标，而itext是一段一段地提取的。

PDFbox版本：1.8.13，不同版本可能部分代码写法不同。

import java.io.*;import org.apache.pdfbox.exceptions.InvalidPasswordException;import org.apache.pdfbox.pdmodel.PDDocument;import org.apache.pdfbox.pdmodel.PDPage;import org.apache.pdfbox.pdmodel.common.PDStream;import org.apache.pdfbox.util.PDFTextStripper;import org.apache.pdfbox.util.TextPosition;import java.io.IOException;import java.util.ArrayList;import java.util.List;public class PrintTextLocations extends PDFTextStripper {static List<Float> list_postion = new ArrayList<Float>();static List<String> list_text = new ArrayList<String>(); public PrintTextLocations() throws IOException { super.setSortByPosition(true); } public static void main(String[] args) throws Exception { PDDocument document = null; try { File input = new File("D://result.pdf"); document = PDDocument.load(input); if (document.isEncrypted()) { document.decrypt(""); } PrintTextLocations printer = new PrintTextLocations(); List allPages = document.getDocumentCatalog().getAllPages(); for (int i = 0; i < allPages.size(); i++) { PDPage page = (PDPage) allPages.get(i); System.out.println("Processing page: " + i); PDStream contents = page.getContents(); if (contents != null) { printer.processStream(page, page.findResources(), page.getContents().getStream()); } } } finally { if (document != null) { document.close(); } } System.out.println(list_text.size()); for(int i = 0;i < list_text.size();i++){ System.out.println(list_text.get(i) ); } } /** * @param text The text to be processed */ @Override /* this is questionable, not sure if needed... 
*/ protected void processTextPosition(TextPosition text) { System.out.println("String[" + text.getXDirAdj() + "," + text.getYDirAdj() + " fs=" + text.getFontSize() + " xscale=" + text.getXScale()+ " yscale=" + text.getYScale() + " height=" + text.getHeightDir() + " space=" + text.getWidthOfSpace() + " width=" + text.getWidthDirAdj() + " x=" + text.getX() + " y=" + text.getY() + " y1=" + text.getTextPos().getYPosition() + " x1=" + text.getTextPos().getXPosition() + " x1=" + text.getTextPos().getXScale() + " x1=" + text.getTextPos().getYScale() + "]" + text.getCharacter()); }}