Porter Stemming Algorithm java code

Stemming is the process of reducing words to their stem. Programs that perform stemming are commonly referred to as stemming algorithms or stemmers. So let us start with a Java program for the Porter Stemming Algorithm.

Porter Stemming Algorithm in  java

Porter Stemming Algorithm java code

Step 1 :- Open the Eclipse IDE and go to File >> New >> Java Project.

Step 2 :- Give the project the name stemming.

Step 3 :- Right-click on src and go to New >> Class.

Step 4 :- Name the class PorterAlgo and paste the code below.

package com.mycompany.algo;

/**
 * Mutable string holder. {@link PorterAlgo#hasSuffix} uses it as an
 * out-parameter to hand the stem (the word minus the suffix) back to the caller.
 */
class NewString {
    public String str;

    NewString() {
        str = "";
    }
}

/**
 * Suffix-stripping stemmer organised as five steps ({@code step1} .. {@code step5}),
 * preceded by an optional prefix-stripping pass.
 *
 * <p>The public entry point is {@link #stripAffixes(String)}. The individual steps
 * are package-private so that callers in the same package can run them one by one.
 */
public class PorterAlgo {

    /** Step 2 rules: {suffix, replacement}. Order matters: the first rule that applies wins. */
    private static final String[][] STEP2_RULES = {
        { "ational", "ate" },
        { "tional", "tion" },
        { "enci", "ence" },
        { "anci", "ance" },
        { "izer", "ize" },
        { "iser", "ize" },
        { "abli", "able" },
        { "alli", "al" },
        { "entli", "ent" },
        { "eli", "e" },
        { "ousli", "ous" },
        { "ization", "ize" },
        { "isation", "ize" },
        { "ation", "ate" },
        { "ator", "ate" },
        { "alism", "al" },
        { "iveness", "ive" },
        { "fulness", "ful" },
        { "ousness", "ous" },
        { "aliti", "al" },
        { "iviti", "ive" },
        { "biliti", "ble" }
    };

    /** Step 3 rules: {suffix, replacement}. Order matters: the first rule that applies wins. */
    private static final String[][] STEP3_RULES = {
        { "icate", "ic" },
        { "ative", "" },
        { "alize", "al" },
        { "alise", "al" },
        { "iciti", "ic" },
        { "ical", "ic" },
        { "ful", "" },
        { "ness", "" }
    };

    /**
     * Step 4 suffixes, removed outright when the remaining stem has measure &gt; 1.
     * NOTE(review): the published Porter algorithm removes only "ion" when the stem
     * ends in 's' or 't'; this table removes the whole "sion"/"tion" - confirm
     * whether that deviation is intended before changing it.
     */
    private static final String[] STEP4_SUFFIXES = {
        "al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent",
        "sion", "tion", "ou", "ism", "ate", "iti", "ous", "ive", "ize", "ise"
    };

    /** Prefixes removed by {@link #stripPrefixes(String)}; the first match wins. */
    private static final String[] PREFIXES = {
        "kilo", "micro", "milli", "intra", "ultra", "mega", "nano", "pico", "pseudo"
    };

    /**
     * Removes every character that is not a letter or a digit.
     *
     * @param str the text to clean; may be empty
     * @return {@code str} with only its letters and digits, in the original order
     */
    String Clean(String str) {
        StringBuilder kept = new StringBuilder(str.length());
        for (int i = 0; i < str.length(); i++) {
            char ch = str.charAt(i);
            if (Character.isLetterOrDigit(ch)) {
                kept.append(ch);
            }
        }
        return kept.toString();
    }

    /**
     * Tests whether {@code word} ends with {@code suffix} and is strictly longer than it.
     *
     * <p>Side effect: once the length check and the quick second-to-last-character
     * check have passed, {@code stem.str} is overwritten with {@code word} minus its
     * last {@code suffix.length()} characters - even if the method then returns
     * {@code false}. When either quick check fails, {@code stem} is left untouched.
     *
     * @param word   the word to inspect
     * @param suffix the candidate suffix
     * @param stem   receives the stem as described above
     * @return {@code true} if {@code word} is longer than {@code suffix} and ends with it
     */
    boolean hasSuffix(String word, String suffix, NewString stem) {
        if (word.length() <= suffix.length()) {
            return false;
        }
        // Cheap rejection on the second-to-last character before doing any copying.
        if (suffix.length() > 1
                && word.charAt(word.length() - 2) != suffix.charAt(suffix.length() - 2)) {
            return false;
        }

        stem.str = word.substring(0, word.length() - suffix.length());
        return word.endsWith(suffix);
    }

    /** Returns {@code true} for the five unconditional vowels a, e, i, o, u. */
    private static boolean isPlainVowel(char ch) {
        return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u';
    }

    /**
     * Decides whether {@code ch} counts as a vowel given the character before it.
     *
     * @param ch   the character to classify
     * @param prev the preceding character (callers pass a placeholder for position 0)
     * @return {@code true} for a, e, i, o, u, and for 'y' unless {@code prev} is one of
     *         those five letters
     */
    boolean vowel(char ch, char prev) {
        if (isPlainVowel(ch)) {
            return true;
        }
        return ch == 'y' && !isPlainVowel(prev);
    }

    /**
     * Counts the vowel-run-followed-by-consonant sequences in {@code stem}
     * (the "measure" used by the step conditions).
     *
     * @param stem the string to measure; may be empty
     * @return the number of such sequences, 0 for an empty string
     */
    int measure(String stem) {
        int i = 0;
        int count = 0;
        int length = stem.length();

        while (i < length) {
            // Advance to the next vowel. At position 0 the placeholder 'a' makes a
            // leading 'y' count as a consonant.
            for (; i < length; i++) {
                char prev = (i > 0) ? stem.charAt(i - 1) : 'a';
                if (vowel(stem.charAt(i), prev)) {
                    break;
                }
            }

            // Skip the rest of the vowel run; i is always >= 1 here.
            for (i++; i < length; i++) {
                if (!vowel(stem.charAt(i), stem.charAt(i - 1))) {
                    break;
                }
            }

            // A consonant was found after the vowel run: one more VC sequence.
            if (i < length) {
                count++;
                i++;
            }
        }

        return count;
    }

    /**
     * Reports whether {@code word} contains at least one vowel as defined by
     * {@link #vowel(char, char)}; a leading 'y' is treated as a consonant.
     *
     * @param word the word to scan; may be empty
     * @return {@code true} if any position holds a vowel
     */
    boolean containsVowel(String word) {
        for (int i = 0; i < word.length(); i++) {
            char prev = (i > 0) ? word.charAt(i - 1) : 'a';
            if (vowel(word.charAt(i), prev)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tests whether {@code str} ends consonant-vowel-consonant where the final
     * consonant is not 'w', 'x' or 'y'.
     *
     * @param str the string to test
     * @return {@code true} if the last three characters form such a pattern;
     *         always {@code false} for strings shorter than three characters
     */
    boolean cvc(String str) {
        int length = str.length();
        if (length < 3) {
            return false;
        }

        char last = str.charAt(length - 1);
        if (vowel(last, str.charAt(length - 2)) || last == 'w' || last == 'x' || last == 'y') {
            return false;
        }
        if (!vowel(str.charAt(length - 2), str.charAt(length - 3))) {
            return false;
        }

        // For a three-letter word there is no preceding character; '?' makes a
        // leading 'y' count as a vowel here.
        char beforeFirst = (length == 3) ? '?' : str.charAt(length - 4);
        return !vowel(str.charAt(length - 3), beforeFirst);
    }

    /**
     * Step 1: handles plural "s" endings, then "eed" / "ed" / "ing", then a final "y".
     *
     * @param str the word to process; an empty string is returned unchanged
     * @return the word after step 1 (may be empty, e.g. for the input "s")
     */
    String step1(String str) {
        if (str.isEmpty()) {
            return str; // nothing to strip; also avoids indexing position -1
        }

        NewString stem = new NewString();

        if (str.charAt(str.length() - 1) == 's') {
            if (hasSuffix(str, "sses", stem) || hasSuffix(str, "ies", stem)) {
                // "sses" -> "ss", "ies" -> "i": both drop the last two characters.
                str = str.substring(0, str.length() - 2);
            } else {
                if (str.length() == 1) {
                    return ""; // the whole word was just "s"
                }
                if (str.charAt(str.length() - 2) != 's') {
                    str = str.substring(0, str.length() - 1);
                }
            }
        }

        if (hasSuffix(str, "eed", stem)) {
            if (measure(stem.str) > 0) {
                str = str.substring(0, str.length() - 1); // "eed" -> "ee"
            }
        } else if (hasSuffix(str, "ed", stem) || hasSuffix(str, "ing", stem)) {
            if (containsVowel(stem.str)) {
                str = stem.str;
                if (str.length() == 1) {
                    return str;
                }

                if (hasSuffix(str, "at", stem) || hasSuffix(str, "bl", stem)
                        || hasSuffix(str, "iz", stem)) {
                    str += "e";
                } else {
                    int length = str.length();
                    char last = str.charAt(length - 1);
                    if (last == str.charAt(length - 2)
                            && last != 'l' && last != 's' && last != 'z') {
                        // Undouble the final consonant (e.g. "hopp" -> "hop").
                        str = str.substring(0, length - 1);
                    } else if (measure(str) == 1 && cvc(str)) {
                        str += "e";
                    }
                }
            }
        }

        if (hasSuffix(str, "y", stem) && containsVowel(stem.str)) {
            str = str.substring(0, str.length() - 1) + "i";
        }
        return str;
    }

    /**
     * Applies the first rule in {@code rules} whose suffix matches {@code str} and
     * whose remaining stem has measure &gt; 0.
     *
     * @param str   the word to process
     * @param rules {suffix, replacement} pairs, tried in order
     * @return the rewritten word, or {@code str} if no rule applies
     */
    private String applyFirstRule(String str, String[][] rules) {
        NewString stem = new NewString();
        for (String[] rule : rules) {
            if (hasSuffix(str, rule[0], stem) && measure(stem.str) > 0) {
                return stem.str + rule[1];
            }
        }
        return str;
    }

    /**
     * Step 2: maps longer derivational suffixes to shorter ones (e.g. "ation" to "ate").
     *
     * @param str the word to process; may be empty
     * @return the word after step 2
     */
    String step2(String str) {
        return applyFirstRule(str, STEP2_RULES);
    }

    /**
     * Step 3: shortens or removes a further set of suffixes (e.g. "ical" to "ic").
     *
     * @param str the word to process; may be empty
     * @return the word after step 3
     */
    String step3(String str) {
        return applyFirstRule(str, STEP3_RULES);
    }

    /**
     * Step 4: removes the first listed suffix that matches and leaves a stem with
     * measure &gt; 1.
     *
     * @param str the word to process; may be empty
     * @return the word after step 4
     */
    String step4(String str) {
        NewString stem = new NewString();
        for (String suffix : STEP4_SUFFIXES) {
            if (hasSuffix(str, suffix, stem) && measure(stem.str) > 1) {
                return stem.str;
            }
        }
        return str;
    }

    /**
     * Step 5: drops a final "e" and reduces a final "ll" to "l" when the measure allows.
     *
     * @param str the word to process; an empty string is returned unchanged
     * @return the word after step 5
     */
    String step5(String str) {
        if (str.isEmpty()) {
            return str; // avoids indexing position -1
        }

        if (str.charAt(str.length() - 1) == 'e') {
            // measure(str) equals the measure of the stem because str ends in a vowel.
            int m = measure(str);
            String stem = str.substring(0, str.length() - 1);
            if (m > 1 || (m == 1 && !cvc(stem))) {
                str = stem;
            }
        }

        if (str.length() == 1) {
            return str;
        }

        int length = str.length();
        if (str.charAt(length - 1) == 'l' && str.charAt(length - 2) == 'l' && measure(str) > 1) {
            str = str.substring(0, length - 1);
        }
        return str;
    }

    /**
     * Removes the first matching prefix from the prefix table, if any.
     *
     * @param str the word to process
     * @return the word without that prefix (possibly empty), or {@code str} unchanged
     */
    String stripPrefixes(String str) {
        for (String prefix : PREFIXES) {
            if (str.startsWith(prefix)) {
                return str.substring(prefix.length());
            }
        }
        return str;
    }

    /** Runs steps 1-5 in order, skipping the remaining steps once the word is empty. */
    private String stripSuffixes(String str) {
        str = step1(str);
        if (str.length() >= 1) {
            str = step2(str);
        }
        if (str.length() >= 1) {
            str = step3(str);
        }
        if (str.length() >= 1) {
            str = step4(str);
        }
        if (str.length() >= 1) {
            str = step5(str);
        }
        return str;
    }

    /**
     * Stems a word: lower-cases it, drops non-alphanumeric characters, and - for
     * words longer than two characters - strips a known prefix and then the suffixes.
     *
     * @param str the raw word; may be empty
     * @return the stemmed word (possibly empty)
     */
    public String stripAffixes(String str) {
        // Locale.ROOT keeps lower-casing independent of the JVM's default locale.
        str = Clean(str.toLowerCase(java.util.Locale.ROOT));

        if (str.length() > 2) {
            str = stripPrefixes(str);
            // A word that consisted only of a prefix is now empty; nothing left to stem.
            if (!str.isEmpty()) {
                str = stripSuffixes(str);
            }
        }

        return str;
    }

}

Step 5 :- Right-click on src again and go to New >> Class.

Step 6 :- Name the class PorterCheck and paste the code below.

package com.mycompany.algo;
import java.io.IOException;
import java.util.*;

public class PorterCheck {

public static void main(String args[]) throws IOException{
//stemming the words
ArrayList<String> tok = new ArrayList<String>();
String[] tokens = {“normalize”,”technical”,”education”,”Nilkanth”};
for (String x: tokens){
tok.add(x);
}
System.out.println(completeStem(tok));
}

//method to completely stem the words in an array list
public static ArrayList<String> completeStem(List<String> tokens1){
PorterAlgo pa = new PorterAlgo();
ArrayList<String> arrstr = new ArrayList<String>();
for (String i : tokens1){
String s1 = pa.step1(i);
String s2 = pa.step2(s1);
String s3= pa.step3(s2);
String s4= pa.step4(s3);
String s5= pa.step5(s4);
arrstr.add(s5);
}
return arrstr;
}
public static ArrayList<String> fileTokenizer(){
StringTokenizer strtoken = new StringTokenizer(“this is a book”);
ArrayList<String> filetoken = new ArrayList<String>();
while(strtoken.hasMoreElements()){
filetoken.add(strtoken.nextToken());
}
return filetoken;
}
}

Step 7 :- Click on Debug or Run to execute the program.

Nilkanth Shet Shirodkar is the founder & CEO of Redicals. He is a software engineer and a passionate web developer at heart. He just loves working with computers.

Leave a Reply

Your email address will not be published. Required fields are marked *