For some odd reason, I end up prototyping different analyzers for PLM space content vs 3ed party analyzers. (Basic need is which got better control on STOP words. At least based on my quick proto type, SOLR got easy constructs.)
Small sample code comparing both analyzers is included.
I did not see much difference for small input text.
public class AnalyzerTest {
private static Analyzer analyzer;
private static long perfTime = 0;
public static void main(String[] args) {
try {
analyzer = new StandardAnalyzer(org.apache.lucene.util.Version.LUCENE_34);
String str = "PLM technology refers to the group of software applications that create and manage the data that define a product and the process for building the product. Beyond just technology, PLM is a discipline that defines best practices for product definition, configuration management, change control, design release, and many other product and process-related procedures.";
perfTime -= System.currentTimeMillis();
displayTokensWithLuceneAnalyzer(analyzer, str);
perfTime += System.currentTimeMillis();
System.out.println("Lucene Analyzer: " + perfTime + " msecs.");
perfTime -= System.currentTimeMillis();
displayTokensWithLingpipeAnalyzer(str);
perfTime += System.currentTimeMillis();
System.out.println("Lingpipe Analyzer: " + perfTime + " msecs.");
} catch (IOException ie) {
System.out.println("IO Error " + ie.getMessage());
}
System.out.println("Time: " + perfTime + " msecs.");
System.out.println("Ended");
}
private static void displayTokensWithLingpipeAnalyzer(String text)
throws IOException {
System.out.println("Inside LingpipeAnalyzer ");
TokenizerFactory ieFactory
= IndoEuropeanTokenizerFactory.INSTANCE;
TokenizerFactory factory
= new EnglishStopTokenizerFactory(ieFactory);
// = new IndoEuropeanTokenizerFactory();
char[] cs =text.toCharArray();
Tokenizer tokenizer = factory.tokenizer(cs, 0, cs.length);
String[] tokens = tokenizer.tokenize();
for (int i = 0; i < tokens.length; i++)
System.out.println(tokens[i]);
System.out.println("Total no. of Tokens: " +tokens.length );
}
private static void displayTokensWithLuceneAnalyzer(Analyzer analyzer, String text)
throws IOException {
System.out.println("Inside LuceneAnalyzer ");
TokenStream tokenStream = analyzer.tokenStream("contents",new StringReader(text) );
OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
int length=0;
while (tokenStream.incrementToken()) {
int startOffset = offsetAttribute.startOffset();
int endOffset = offsetAttribute.endOffset();
String term = charTermAttribute.toString();
System.out.println("term->"+term+ " start:"+startOffset+" end:"+endOffset);
length++;
}
System.out.println("Total no. of Tokens: " + length);
}
}