Monday, 20 October 2014

Extract the Noun from the Text using NLP

Extract the Noun from the Text using NLP

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;

import edu.stanford.nlp.tagger.maxent.MaxentTagger;

public class GetNoun {
public static void main(String[] args) throws IOException, TikaException {

MaxentTagger tagger = new MaxentTagger("/home/hadoop/work/jars/wsj-0-18-bidirectional-nodistsim.tagger");
String con = new Tika().parseToString(new File("
/home/hadoop/work/text.txt"));
String string = con.replaceAll("[^\\p{L}\\p{Nd}]", " ");
String string2 = string.replaceAll("\\s+", " ");
String lo = string2.toLowerCase();
ArrayList<String> set = new ArrayList<String>();
StringTokenizer tokenizer = new StringTokenizer(lo);
while (tokenizer.hasMoreElements()) {
String word = (String) tokenizer.nextElement();
String tagged = tagger.tagString(word);
Pattern p = Pattern.compile(".*NN");
Matcher m = p.matcher(tagged);
if(m.find()){
String string3=m.group(0);
String noun=string3.replace("_NN", "");
set.add(noun);
}
}
File file = new File("
/home/hadoop/work/filename.txt");
if (!file.exists()) {
file.createNewFile(); }
FileWriter fw = new FileWriter(file.getAbsoluteFile());
BufferedWriter bw = new BufferedWriter(fw);
bw.write(set.toString().replaceAll("\\[", "").replaceAll("\\]", ""));
bw.close();

System.out.println(set.toString().replaceAll("\\[", "").replaceAll("\\]", ""));
}
}
Related Posts Plugin for WordPress, Blogger...