I have downloaded the POI jar files from here:
http://www.apache.org/dyn/closer.cgi/poi/release/
Then I renamed the extracted directory to "poi".
The following Ruby script returns the parsed text. It relies on a small Java helper, WordSampleReader.java; that file must be compiled, and the resulting class must be placed in the poi folder alongside all the jar files.
# Script-wide settings. 'host' selects the Java classpath separator used by
# WordReader (';' for Windows hosts, ':' otherwise).
CONFIG = {}

# Extracts the plain text of a Word document by running the compiled
# WordSampleReader Java class with every jar found in ./poi on the classpath.
class WordReader
  # Hard-coded to a Windows host; change this value on other platforms so the
  # classpath entries are joined with ':' instead of ';'.
  CONFIG['host'] = 'mswin32'

  # Runs `java WordSampleReader <filename>` and returns whatever the process
  # writes to standard output (read in binary mode).
  #
  # The command is passed to IO.popen as an argument array, so no shell is
  # involved: file names containing spaces or shell metacharacters reach Java
  # unchanged and cannot inject extra commands.
  #
  # @param filename [String] path of the Word document to read
  # @return [String] standard output of the Java process
  # @raise [Errno::ENOENT] if the ./poi directory or the java executable
  #   cannot be found
  def self.generate_text(filename)
    command = ['java', '-cp', classpath, 'WordSampleReader', filename]
    IO.popen(command, 'rb') { |java| java.read }
  end

  # Builds the Java classpath: the ./poi directory itself (where the compiled
  # WordSampleReader class lives) followed by every *.jar file inside it.
  # Jars are sorted so the classpath does not depend on directory order.
  def self.classpath
    poi_dir = File.join(Dir.getwd, 'poi')
    jars = Dir.entries(poi_dir).select { |name| File.extname(name) == '.jar' }.sort
    separator = (CONFIG['host'].to_s =~ /mswin|mingw/) ? ';' : ':'
    ([poi_dir] + jars.map { |jar| File.join(poi_dir, jar) }).join(separator)
  end
  private_class_method :classpath
end
# Demo: extract the text of poi/test.doc and print it.
reader = WordReader.generate_text('poi/test.doc')
puts reader
Although I don't know Java, I used sample code found via Google. The file is called WordSampleReader.java.
Its source code is:
//package com.informit.poi;
// Import POI classes
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.hwpf.*;
import org.apache.poi.hwpf.extractor.*;
// Import Java classes
import java.io.*;
import java.util.*;
public class WordSampleReader
{
public static void main( String[] args )
{
if( args.length == 0 )
{
System.out.println( "Usage: WordSampleReader
System.exit( 0 );
}
String filename = args[ 0 ];
try
{
// Create a POI File System object; this is the main class for the POIFS file system
// and it manages the entire lifecycle of the file system
POIFSFileSystem fs = new POIFSFileSystem( new FileInputStream( filename ) );
// Create a document for this file
HWPFDocument doc = new HWPFDocument( fs );
// Create a WordExtractor to read the text of the word document
WordExtractor we = new WordExtractor( doc );
// Extract all paragraphs in the document as strings
String[] paragraphs = we.getParagraphText();
// Output the document
//System.out.println( "Word Document has " + paragraphs.length + " paragraphs" );
// for( int i=0; i
//{
//System.out.println( paragraphs[ i ] );
//}
// output text
System.out.println( we.getText() );
}
catch( Exception e )
{
e.printStackTrace();
}
}
}