java读取doc文本,java读取doc文件内容

本文目录一览：

1、java读取word文件的内容
2、java读取带格式word内容
3、JAVA有什么好的方法可以将word里的文本读取出来
4、java读取doc,pdf问题。
5、如何使用JAVA，POI读写word文档
6、java如何获得doc文件内容

java读取word文件的内容

WordExtractor的用法错了，你用下面的方法试试：

......

// Wrap the location of the Word file; `doc` comes from the elided code above (presumably a path String - confirm).
File file = new File(doc);

// getFileInputStream is a helper that is not shown in this snippet; presumably it opens `file` for reading.
FileInputStream fileInputStream = getFileInputStream(file);

// Construct the extractor directly from the input stream.
WordExtractor wordExtractor = new WordExtractor(fileInputStream);

// Retrieve the document text as one String.
String text = wordExtractor.getText();

......

java读取带格式word内容

用jacob吧。。

/**
 * @author eyuan
 */

package per.eyuan.word2txt.core;

import com.jacob.*;

import com.jacob.com.*;

import com.jacob.activeX.*;

import java.io.*;

import java.util.Scanner;

public class Core {

/**

* 实现转换的函数

* @param sourceFilesPath

* @param destinationFilesPath

* @param destinationFilesType

* @return void

* @see import com.jacob.activeX.*;

public static void change(String sourceFilesPath,String destinationFilesPath,int destinationFilesType){

//使用word文件所在的目录（源路径）建立目录文件

File sourcePathFile=new File(sourceFilesPath);

//取得word文件（源文件列表）

File sourceFilesList[]=sourcePathFile.listFiles();

System.out.println("共有"+sourceFilesList.length+"个文件（文件夹）");

//指定要转换的文件所在的目录下，如果有子目录，

//则进入子目录，继续查找word文档并将其转换，

//直到将指定目录下的所有word文档转换完。

//子目录名

String sourceChildPath=new String("");

//保持原来的层次关系，将子目录下的文件存放在新建的子目录中

String destiNationChildPath=new String("");

//检索文件，过滤掉非word文件，通过扩展名过滤

for(int i=0;isourceFilesList.length;i++){

//排除掉子文件夹

if(sourceFilesList[i].isFile()){

System.out.println("第"+(i+1)+"个文件：");

//取得文件全名（包含扩展名）

String fileName=sourceFilesList[i].getName();

String fileType=new String("");

//取得文件扩展名

fileType=fileName.substring((fileName.length()-4), fileName.length());

//word2007-2010扩展名为docx

//判断是否为word2007-2010文档，及是否以docx为后缀名

if(fileType.equals("docx")){

System.out.println("正在转换。。。");

//输出word文档所在路劲

System.out.println("目录："+sourceFilesPath);

//输出word文档名

System.out.println("文件名："+fileName);

//System.out.println(fileName.substring(0, (fileName.length()-5)));

//核心函数

//启动word

ActiveXComponent app=new ActiveXComponent("Word.Application");

//要转换的文档的全路径（所在文件夹+文件全名）

String docPath=sourceFilesPath+"\"+fileName;

//转换后的文档的全路径（所在文件夹+文件名）

String othersPath=destinationFilesPath+"\"+fileName.substring(0,(fileName.length()-5));

String inFile=docPath;

String outFile=othersPath;

boolean flag=false;

//核心代码

try{

//设置word可见性

app.setProperty("Visible", new Variant(false));

Dispatch docs=app.getProperty("Documents").toDispatch();

//打开word文档

Dispatch doc=Dispatch.invoke(docs, "Open", Dispatch.Method, new Object[]{inFile,new Variant(false),new Variant(true)}, new int[1]).toDispatch();

//0:Microsoft Word 97 - 2003 文档 (.doc)

//1:Microsoft Word 97 - 2003 模板 (.dot)

//2:文本文档 (.txt)

//3:文本文档 (.txt)

//4:文本文档 (.txt)

//5:文本文档 (.txt)

//6:RTF 格式 (.rtf)

//7:文本文档 (.txt)

//8:HTML 文档 (.htm)(带文件夹)

//9:MHTML 文档 (.mht)(单文件)

//10:MHTML 文档 (.mht)(单文件)

//11:XML 文档 (.xml)

//12:Microsoft Word 文档 (.docx)

//13:Microsoft Word 启用宏的文档 (.docm)

//14:Microsoft Word 模板 (.dotx)

//15:Microsoft Word 启用宏的模板 (.dotm)

//16:Microsoft Word 文档 (.docx)

//17:PDF 文件 (.pdf)

//18:XPS 文档 (.xps)

//19:XML 文档 (.xml)

//20:XML 文档 (.xml)

//21:XML 文档 (.xml)

//22:XML 文档 (.xml)

//23:OpenDocument 文本 (.odt)

//24:WTF 文件 (.wtf)

//另存为指定格式的文档

Dispatch.invoke(doc, "SaveAs", Dispatch.Method, new Object[]{outFile,new Variant(destinationFilesType)}, new int[1]);

Variant file=new Variant(false);

//关闭文档

Dispatch.call(doc, "Close",file);

flag=true;

}catch(Exception e){

e.printStackTrace();

System.out.println("文档转换失败");

}finally{

app.invoke("Quit",new Variant[]{});

}

System.out.println("转换完毕");

}

//word97-2003扩展名为doc

//判断是否为word2003-2007文档，及是否以doc为后缀名

else if(fileType.equals(".doc")){

System.out.println("正在转换。。。");

//输出word文档所在路劲

System.out.println("目录："+sourceFilesPath);

//输出word文档名

System.out.println("文件名："+fileName);

//System.out.println(fileName.substring(0, (fileName.length()-4)));

//核心函数

//启动word

ActiveXComponent app=new ActiveXComponent("Word.Application");

//要转换的文档的全路径（所在文件夹+文件全名）

String docPath=sourceFilesPath+"\"+fileName;

//转换后的文档的全路径（所在文件夹+文件名）

String othersPath=destinationFilesPath+"\"+fileName.substring(0,(fileName.length()-4));

String inFile=docPath;

String outFile=othersPath;

boolean flag=false;

//核心代码

try{

//设置word可见性

app.setProperty("Visible", new Variant(false));

Dispatch docs=app.getProperty("Documents").toDispatch();

//打开word文档

Dispatch doc=Dispatch.invoke(docs, "Open", Dispatch.Method, new Object[]{inFile,new Variant(false),new Variant(true)}, new int[1]).toDispatch();

//另存为指定格式的文档

Dispatch.invoke(doc, "SaveAs", Dispatch.Method, new Object[]{outFile,new Variant(destinationFilesType)}, new int[1]);

Variant file=new Variant(false);

//关闭文档

Dispatch.call(doc, "Close",file);

flag=true;

}catch(Exception e){

e.printStackTrace();

System.out.println("文档转换失败");

}finally{

app.invoke("Quit",new Variant[]{});

}

System.out.println("转换完毕");

}

//文档的扩展名不是doc或docx

else{

System.out.println("非word文档");

}

//如果是子文件夹，则递归遍历，将所有的word文档转换

else{

sourceChildPath=sourceFilesPath;

//该文件是目录

sourceChildPath=sourceChildPath+"\"+sourceFilesList[i].getName()+"\";

System.out.println("源文件所在路径："+sourceChildPath);

//修改目标文件夹，保持原来的层级关系

destiNationChildPath=destinationFilesPath;

destiNationChildPath=destinationFilesPath+"\"+sourceFilesList[i].getName()+"\";

System.out.println("转换后文件所在路径"+destiNationChildPath);

mkdir(destiNationChildPath);

//递归遍历所有目录，查找word文档，并将其转换

change(sourceChildPath, destiNationChildPath,destinationFilesType);

}

System.out.println("所有文档转换完毕");

}

/**

　* 用于创建文件夹的方法

　* @param mkdirName

public static void mkdir(String mkdirName){

try{

//使用指定的路径创建文件对象

File dirFile = new File(mkdirName);

boolean bFile = dirFile.exists();

//已经存在文件夹，操作？？？提醒是否要替换

if( bFile == true ) {

System.out.println("已经存在文件夹"+mkdirName);

}

//不存在该文件夹，则新建该目录

else{

System.out.println("新建文件夹"+mkdirName);

bFile = dirFile.mkdir();

if( bFile == true ){

System.out.println("文件夹创建成功");

}else{

System.out.println(" 文件夹创建失败，清确认磁盘没有写保护并且空件足够");

System.exit(1);

}

}catch(Exception err){

System.err.println("ELS - Chart : 文件夹创建发生异常");

err.printStackTrace();

}finally{

}

/**

* 判断某个文件夹是否存在

* @param path

public static boolean isPathExist(String path){

boolean isPathExist=false;

try{

File pathFile = new File(path);

if(pathFile.exists())

isPathExist= true;

else

isPathExist= false;

}catch(Exception err){

err.printStackTrace();

}

return isPathExist;

}

/**

* 主函数

public static void main(String[] args){

Scanner sc=new Scanner(System.in);

//源文档所在路径

String sourceFilesPath="";

// String inputSourcePath="";

// boolean sourcePathFlag=true;

// System.out.println("请输入要转换文档所在的文件夹");

// while(sourcePathFlag){

// inputSourcePath=sc.next();

// if(!isPathExist(inputSourcePath))

// System.out.println("源路径不存在，请输入正确的路径");

// else

// sourcePathFlag=false;

// }

// sourceFilesPath=inputSourcePath;

sourceFilesPath="D:\word";

//目标文档要存放的目录

String destinationFilesPath="";

// String inputdestinationPath="";

// boolean destinationPathFlag=true;

// System.out.println("请输入转换后文档要存放的文件夹");

// while(destinationPathFlag){

// inputdestinationPath=sc.next();

// //目标文件不存在时，是否要提示用户创建文件

// if(!isPathExist(inputdestinationPath))

// System.out.println("目标路径不存在，请输入正确的路径");

// else

// destinationPathFlag=false;

// }

// destinationFilesPath=inputdestinationPath;

destinationFilesPath="D:\txt";

//选择要转换的类型

int destinationFilesType=0;

int inputNumber=0;

boolean numFlag=true;

System.out.println("您要将word文档转换为哪种文档格式？");

System.out.println("0:doc t 2:txt t 8:html t 9:htm t 11:xml t 12:docx t 17:pdf t 18:xps");

while(numFlag){

inputNumber=sc.nextInt();

if(inputNumber!=2inputNumber!=8inputNumber!=9inputNumber!=11inputNumber!=12inputNumber!=17){

System.out.println("您的输入有误，请输入要转换的文档类型前的数字");

}else

numFlag=false;

}

destinationFilesType=inputNumber;

//实行转换

change(sourceFilesPath, destinationFilesPath,destinationFilesType);

//测试各种类型转换

// for(int i=0;i25;i++){

// destinationFilesType=i;

// System.out.println("文件类型"+destinationFilesType);

// System.out.println("存放目录："+destinationFilesPath+"\"+i);

// mkdir(destinationFilesPath+"\"+i);

// change(sourceFilesPath, destinationFilesPath+"\"+i,destinationFilesType);

// }

}

这个我刚用的。。格式都能带过来的。你自己再下载个 jacob的包和dll文件

JAVA有什么好的方法可以将word里的文本读取出来

你用免费版的Free Spire.Doc for Java可以直接读取Word文档里面的文本，参考代码：

import com.spire.doc.Document;

import java.io.FileWriter;

import java.io.IOException;

public class ExtractText {

public static void main(String[] args) throws IOException {

//加载Word文档

Document document = new Document();

document.loadFromFile("C:\Users\Administrator\Desktop\sample.docx");

//获取文档中的文本保存为String

String text=document.getText();

//将String写入Txt文件

writeStringToTxt(text,"ExtractedText.txt");

}

public static void writeStringToTxt(String content, String txtFileName) throws IOException {

FileWriter fWriter= new FileWriter(txtFileName,true);

try {

fWriter.write(content);

}catch(IOException ex){

ex.printStackTrace();

}finally{

try{

fWriter.flush();

fWriter.close();

} catch (IOException ex) {

ex.printStackTrace();

}

参考自官网原文。

java读取doc,pdf问题。

PDFBox是一个开源的、用于操作pdf文件的Java库。使用前需要把 PDFBox-0.7.3.jar 加入classpath，同时还要把 FontBox1.0.jar 也加入classpath，否则运行时会报错。

import java.io.FileInputStream;

import java.io.FileNotFoundException;

import java.io.IOException;

import org.pdfbox.pdfparser.PDFParser;

import org.pdfbox.pdmodel.PDDocument;

import org.pdfbox.util.PDFTextStripper;

public class PdfReader {

/**

* simply reader all the text from a pdf file.

* You have to deal with the format of the output text by yourself.

* 2008-2-25

* @param pdfFilePath file path

* @return all text in the pdf file

public static String getTextFromPDF(String pdfFilePath)

{

String result = null;

FileInputStream is = null;

PDDocument document = null;

try {

is = new FileInputStream(pdfFilePath);

PDFParser parser = new PDFParser(is);

parser.parse();

document = parser.getPDDocument();

PDFTextStripper stripper = new PDFTextStripper();

result = stripper.getText(document);

} catch (FileNotFoundException e) {

// TODO Auto-generated catch block

e.printStackTrace();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

} finally {

if (is != null) {

try {

is.close();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

if (document != null) {

try {

document.close();

} catch (IOException e) {

// TODO Auto-generated catch block

e.printStackTrace();

}

return result;

}

public static void main(String[] args)

{

String str=PdfReader.getTextFromPDF("C:\Read.pdf");

System.out.println(str);

}

代码2：

import java.io.File;

import java.io.FileOutputStream;

import java.io.OutputStreamWriter;

import java.io.Writer;

import java.net.MalformedURLException;

import java.net.URL;

import org.pdfbox.pdmodel.PDDocument;

import org.pdfbox.util.PDFTextStripper;

public class PDFReader {

public void readFdf(String file) throws Exception {

boolean sort = false;

String pdfFile = file;

String textFile = null;

String encoding = "UTF-8";

int startPage = 1;

int endPage = Integer.MAX_VALUE;

Writer output = null;

PDDocument document = null;

try {

// 首先当作一个URL来装载文件，如果得到异常再从本地文件系统//去装载文件

URL url = new URL(pdfFile);

//注意参数已不是以前版本中的URL.而是File。

document = PDDocument.load(pdfFile);

// 获取PDF的文件名

String fileName = url.getFile();

// 以原来PDF的名称来命名新产生的txt文件

if (fileName.length() 4) {

File outputFile = new File(fileName.substring(0, fileName

.length() - 4)

+ ".txt");

textFile = outputFile.getName();

}

} catch (MalformedURLException e) {

// 如果作为URL装载得到异常则从文件系统装载

//注意参数已不是以前版本中的URL.而是File。

document = PDDocument.load(pdfFile);

if (pdfFile.length() 4) {

textFile = pdfFile.substring(0, pdfFile.length() - 4)

+ ".txt";

}

output = new OutputStreamWriter(new FileOutputStream(textFile),

encoding);

PDFTextStripper stripper = null;

stripper = new PDFTextStripper();

// 设置是否排序

stripper.setSortByPosition(sort);

// 设置起始页

stripper.setStartPage(startPage);

// 设置结束页

stripper.setEndPage(endPage);

// 调用PDFTextStripper的writeText提取并输出文本

stripper.writeText(document, output);

} finally {

if (output != null) {

// 关闭输出流

output.close();

}

if (document != null) {

// 关闭PDF Document

document.close();

}

/**

* @param args

public static void main(String[] args) {

// TODO Auto-generated method stub

PDFReader pdfReader = new PDFReader();

try {

// 取得E盘下的SpringGuide.pdf的内容

pdfReader.readFdf("C:\Read.pdf");

} catch (Exception e) {

e.printStackTrace();

}

2、抽取支持中文的pdf文件－xpdf

xpdf是一个开源项目，我们可以在Java中调用它的本地可执行程序（pdftotext）来抽取中文pdf文件的文本。

补丁包：

按照readme放好中文的patch，就可以开始写调用本地方法的java程序了。

下面是一个如何调用的例子：

import java.io.*;

/**
 * <p>Title: pdf extraction</p>
 * <p>Description: email:chris@matrix.org.cn</p>
 * <p>Company: Matrix.org.cn</p>
 *
 * @author chris
 * @version 1.0,who use this example pls remain the declare
 */
public class PdfWin {

    public PdfWin() {
    }

    /**
     * Runs xpdf's pdftotext on a fixed pdf file and prints the extracted text.
     *
     * @throws Exception if the process cannot be started or its output cannot be read
     */
    public static void main(String args[]) throws Exception {
        // Backslashes must be escaped in Java string literals.
        String pathToXpdf = "C:\\Program Files\\xpdf\\pdftotext.exe";
        String filename = "c:\\a.pdf";
        // The trailing "-" makes pdftotext write the text to standard output.
        String[] cmd = new String[] { pathToXpdf, "-enc", "UTF-8", "-q", filename, "-" };
        Process p = Runtime.getRuntime().exec(cmd);

        StringWriter out = new StringWriter();
        try (InputStreamReader reader =
                new InputStreamReader(new BufferedInputStream(p.getInputStream()), "UTF-8")) {
            char[] buf = new char[10000];
            int len;
            while ((len = reader.read(buf)) >= 0) {
                // Accumulate every chunk; the buffer alone only ever holds the last
                // read, possibly followed by stale characters from earlier reads.
                out.write(buf, 0, len);
                System.out.println("the length is" + len);
            }
        }
        // Reap the child process once its output has been fully consumed.
        p.waitFor();

        String ts = out.toString();
        System.out.println("the str is" + ts);
    }
}

如何使用JAVA，POI读写word文档

public class HwpfTest {

@SuppressWarnings("deprecation")

@Test

public void testReadByExtractor() throws Exception {

InputStream is = new FileInputStream("D:\test.doc");

WordExtractor extractor = new WordExtractor(is);

//输出word文档所有的文本

System.out.println(extractor.getText());

System.out.println(extractor.getTextFromPieces());

//输出页眉的内容

System.out.println("页眉：" + extractor.getHeaderText());

//输出页脚的内容

System.out.println("页脚：" + extractor.getFooterText());

//输出当前word文档的元数据信息，包括作者、文档的修改时间等。

System.out.println(extractor.getMetadataTextExtractor().getText());

//获取各个段落的文本

String paraTexts[] = extractor.getParagraphText();

for (int i=0; iparaTexts.length; i++) {

System.out.println("Paragraph " + (i+1) + " : " + paraTexts[i]);

}

//输出当前word的一些信息

printInfo(extractor.getSummaryInformation());

//输出当前word的一些信息

this.printInfo(extractor.getDocSummaryInformation());

this.closeStream(is);

}

/**

* 输出SummaryInfomation

* @param info

private void printInfo(SummaryInformation info) {

//作者

System.out.println(info.getAuthor());

//字符统计

System.out.println(info.getCharCount());

//页数

System.out.println(info.getPageCount());

//标题

System.out.println(info.getTitle());

//主题

System.out.println(info.getSubject());

}

/**

* 输出DocumentSummaryInfomation

* @param info

private void printInfo(DocumentSummaryInformation info) {

//分类

System.out.println(info.getCategory());

//公司

System.out.println(info.getCompany());

}

/**

* 关闭输入流

* @param is

private void closeStream(InputStream is) {

if (is != null) {

try {

is.close();

} catch (IOException e) {

e.printStackTrace();

}

java如何获得doc文件内容

Java 生成 doc 文件通常是这样处理的：模板 + 数据 = doc 文件。

反过来，doc 文件 + 模板，才可以还原得到数据。

如果你没有模板，基本上是做不到的。