// Copyright (c) 2004-2005 AUEB

/**
 * Class that implements PORTER Stemmer.
 * Original notes on the algorithm at website:
 * http://www.tartarus.org/~martin/PorterStemmer
 *
 * A consonant will be denoted by c, a vowel by v.
 *
 * Any word, or part of a word, therefore has the following form: [C](VC)^m[V],
 * where m will be called the measure.
 *
 * The rules for removing a suffix will be given in the form: (condition) S1 -> S2
 * This means that if a word ends with the suffix S1, and the stem before S1 satisfies the given condition,
 * S1 is replaced by S2. The condition is usually given in terms of m.
 *
 * The `condition' part may also contain the following:
 * (1) *S - the stem ends with S (and similarly for the other letters).
 * (2) *v* - the stem contains a vowel.
 * (3) *d - the stem ends with a double consonant.
 * (4) *o - the stem ends cvc, where the second c is not W, X or Y.
 *
 *
 * @author Semi Koen (3010051) -&- Dimitris Lamprou (3010047)
 */


public class PorterStemmer
{
    private String stemmedWord;


    
    public PorterStemmer()
    {}
        
    
    /**
    * Default Constructor. It implements the stemming.
    * @param word String representing the word to be stemmed.
    */
    public String getStemmedWord(String word)
    {
        stemmedWord = word;
        boolean goToStem = true;

        if (stemmedWord.length() > 2)
        {
            //check if word consists of letters (not digits or special characters)
            char[] cArr = word.toCharArray();
            
            for (int i=0; i<cArr.length; i++)
                if (!Character.isLetter(cArr[i]))
                {
                    goToStem = false;
                    break;
                }
        }
        else
            goToStem = false;

        if (goToStem)
            stemString();
        
        return stemmedWord;        
    }   

//--------------------- stemString ---------------------//

    /**
    * It does the actual stemming by calling step-methods.
    */
    private void stemString()
    {
        stemmedWord = step1a(stemmedWord);
        stemmedWord = step1b(stemmedWord);
        stemmedWord = step1c(stemmedWord);
        stemmedWord = step2(stemmedWord);
        stemmedWord = step3(stemmedWord);
        stemmedWord = step4(stemmedWord);
        stemmedWord = step5a(stemmedWord);
        stemmedWord = step5b(stemmedWord);
    }


//--------------------- STEP 1 ---------------------//

    /**
    * Step 1a of the stemmer.
    * It deals with the final -s of a word.
    * @param str String representing the up-to-now word.
    */
    private String step1a(String str)
    {
        if (str.endsWith("s"))
        {
            //SSES -> SS (eg. caresses -> caress)
            if (str.endsWith("sses"))
                return str.substring(0, str.length() - 2);
            //IES -> I (eg. ponies -> poni)
            else if (str.endsWith("ies"))
                return str.substring(0, str.length() - 2);
            //SS -> SS (eg. caress -> caress)
            else if (str.endsWith("ss"))
                return str;
            //S -> (eg. cats -> cat)
            else //if (str.endsWith("s"))
                return str.substring(0, str.length() - 1);
        }
        else
            return str;
    }


    /**
    * Step 1b of the stemmer.
    * It deals with the gerund (-ing) and the past participle (-ed).
    * @param str String representing the up-to-now word.
    */
    private String step1b(String str)
    {
        String result = str;
        
        if (str.endsWith("ed"))
        {
            //(m > 0) EED -> EE (eg. feed -> feed, agreed -> agree)
            if (str.endsWith("eed") && stringMeasure(str.substring(0, str.length() - 3)) > 0)
                result = str.substring(0, str.length() - 1);
            //(*v*) ED -> (eg. plastered -> plaster, bled -> bled)
            else if (str.endsWith("ed") && containsVowel(str.substring(0, str.length() - 2)))
                result = step1b2(str.substring(0, str.length() - 2));
        }
        //(*v*) ING -> (eg. motoring -> motor, sing -> sing)
        else if (str.endsWith("ing") && containsVowel(str.substring(0, str.length() - 3)))
            result = step1b2(str.substring(0, str.length() - 3));

        return result;
    }


    /**
    * Step 1b (cont.) of the stemmer.
    * If the second or third of the rules in Step 1b is successful, this function is called.
    * @param str String representing the up-to-now word.
    */
    private String step1b2(String str)
    {
        String result = str;

        //AT -> ATE (eg. conflat(ed) -> conflate)
        //BL -> BLE (eg. troubl(ed) -> trouble)
        //IZ -> IZE (eg. siz(ed) -> size)
        //This -e may be removed in step 4.
        if (str.endsWith("at") || str.endsWith("bl") || str.endsWith("iz"))
            result = str + "e";
        //(*d and not (*L or *S or *Z)) -> single letter
        //(eg. hopp(ing) -> hop, tann(ed) -> tan, fall(ing) -> fall,
        //hiss(ing) -> hiss, fizz(ed) -> fizz)
        else if (endsWithDoubleConsonant(str) &&
                !(str.endsWith("l") || str.endsWith("s") || str.endsWith("z")))
            result = str.substring(0, str.length() - 1);
        //(eg. fail(ing) ->  fail, fil(ing) -> file)
        else if ( stringMeasure(str) == 1 && endsWithCVC(str) )
            result = str + "e";

        return result;
    }


    /**
    * Step 1c of the stemmer.
    * It deals with words ending with -y.
    * @param str String representing the up-to-now word.
    */
    private String step1c(String str)
    {
        //(*v*) Y -> I (eg. happy -> happi, sky -> sky)
        if (str.endsWith("y") && containsVowel(str.substring(0, str.length() - 1)))
            return str.substring(0, str.length() - 1) + "i";
        else
            return str;
    }


//--------------------- STEP 2 ---------------------//

    /**
    * Step 2 of the stemmer.
    * Its function is straightforward.
    * The test for the string str is done with a "switch" on the penultimate letter.
    * @param str String representing the up-to-now word.
    */
    private String step2(String str)
    {
        String result = str;

        if (str.length() > 2)
        {
            switch (str.charAt(str.length() - 2))
            {
            case 'a':
                //(m > 0) ATIONAL -> ATE (eg. relational -> relate)
                if (str.endsWith("ational") && stringMeasure(str.substring(0, str.length() - 5)) > 0)
                    result = str.substring(0, str.length() - 5) + "e";
                //(m > 0) TIONAL -> TION (eg. conditional -> condition, rational -> rational)
                else if (str.endsWith("tional") && stringMeasure(str.substring(0, str.length() - 2)) > 0)
                    result = str.substring(0, str.length() - 2);
                break;
            case 'c':
                //(m > 0) ENCI -> ENCE (eg. valenci -> valence)
                if (str.endsWith("enci") && stringMeasure(str.substring(0, str.length() - 1)) > 0)
                    result = str.substring(0, str.length() - 1) + "e";
                //(m > 0) ANCI -> ANCE (eg. hesitanci -> hesitance)
                else if (str.endsWith("anci") && stringMeasure(str.substring(0, str.length() - 1)) > 0)
                    result = str.substring(0, str.length() - 1) + "e";
                break;
            case 'e':
                //(m > 0) IZER -> IZE (eg. digitizer -> digitize)
                if (str.endsWith("izer") && stringMeasure(str.substring(0, str.length() - 1)) > 0)
                    result = str.substring(0, str.length() - 1);
                break;
            case 'l':
                //(m > 0) ABLI -> ABLE (eg. conformabli -> conformable)
                if (str.endsWith("abli") && stringMeasure(str.substring(0, str.length() - 1)) > 0)
                    result = str.substring(0, str.length() - 1) + "e";
                //(m > 0) ALLI -> AL (eg. radicalli -> radical)
                else if (str.endsWith("alli") && stringMeasure(str.substring(0, str.length() - 2)) > 0)
                    result = str.substring(0, str.length() - 2);
                //(m > 0) ENTLI -> ENT (eg. differentli -> different)
                else if (str.endsWith("entli") && stringMeasure(str.substring(0, str.length() - 2)) > 0)
                    result = str.substring(0, str.length() - 2);
                //(m > 0) ELI -> E (eg. vileli - > vile)
                else if (str.endsWith("eli") && stringMeasure(str.substring(0, str.length() - 2)) > 0)
                    result = str.substring(0, str.length() - 2);
                //(m > 0) OUSLI -> OUS (eg. analogousli -> analogous)
                else if (str.endsWith("ousli") && stringMeasure(str.substring(0, str.length() - 2)) > 0)
                    result = str.substring(0, str.length() - 2);
                break;
            case 'o':
                //(m > 0) IZATION -> IZE (eg. vietnamization -> vietnamize)
                if (str.endsWith("ization") && stringMeasure(str.substring(0, str.length() - 5)) > 0)
                    result = str.substring(0, str.length() - 5) + "e";
                //(m > 0) ATION -> ATE (eg. predication -> predicate)
                else if (str.endsWith("ation") && stringMeasure(str.substring(0, str.length() - 3)) > 0)
                    result = str.substring(0, str.length() - 3) + "e";
                //(m > 0) ATOR -> ATE (eg. operator -> operate)
                else if (str.endsWith("ator") && stringMeasure(str.substring(0, str.length() - 2)) > 0)
                    result = str.substring(0, str.length() - 2) + "e";
                break;
            case 's':
                //(m > 0) ALISM -> AL (eg. feudalism -> feudal)
                if (str.endsWith("alism") && stringMeasure(str.substring(0, str.length() - 3)) > 0)
                    result = str.substring(0, str.length() - 3);
                //(m > 0) IVENESS -> IVE (eg. decisiveness -> decisive)
                else if (str.endsWith("iveness") && stringMeasure(str.substring(0, str.length() - 4)) > 0)
                    result = str.substring(0, str.length() - 4);
                //(m > 0) FULNESS -> FUL (eg. hopefulness -> hopeful)
                else if (str.endsWith("fulness") && stringMeasure(str.substring(0, str.length() - 4)) > 0)
                    result = str.substring(0, str.length() - 4);
                // (m > 0) OUSNESS -> OUS (eg. callousness -> callous)
                else if (str.endsWith("ousness") && stringMeasure(str.substring(0, str.length() - 4)) > 0)
                    result = str.substring(0, str.length() - 4);
                break;
            case 't':
                //(m > 0) ALITII -> AL (eg. formaliti -> formal)
                if (str.endsWith("aliti") && stringMeasure(str.substring(0, str.length() - 3)) > 0)
                    result = str.substring(0, str.length() - 3);
                //(m > 0) IVITI -> IVE (eg. sensitiviti -> sensitive)
                else if (str.endsWith("iviti") && stringMeasure(str.substring(0, str.length() - 3)) > 0)
                    result = str.substring(0, str.length() - 3) + "e";
                //(m > 0) BILITI -> BLE (eg. sensibiliti -> sensible)
                else if (str.endsWith("biliti") && stringMeasure(str.substring(0, str.length() - 5)) > 0)
                    result = str.substring(0, str.length() - 5) + "le";
                break;
            }
        }//if

        return result;
    }


//--------------------- STEP 3 ---------------------//

    /**
    * Step 3 of the stemmer.
    * Its function is straightforward.
    * The test for the string str is done with a "switch" on the last letter.
    * @param str String representing the up-to-now word.
    */
    private String step3(String str)
    {
        String result = str;

        if (str.length() > 1)
        {

            switch (str.charAt(str.length() - 1))
            {
            case 'e':
                //(m > 0) ICATE -> IC (eg. triplicate -> triplic)
                if (str.endsWith("icate") && stringMeasure(str.substring(0, str.length() - 3)) > 0)
                    result = str.substring(0, str.length() - 3);
                //(m > 0) ATIVE -> (eg. formative -> form)
                else if (str.endsWith("ative") && stringMeasure(str.substring(0, str.length() - 5)) > 0)
                    result = str.substring(0, str.length() - 5);
                //(m > 0) ALIZE -> AL (eg. formalize -> formal)
                else if (str.endsWith("alize") && stringMeasure(str.substring(0, str.length() - 3)) > 0)
                    result = str.substring(0, str.length() - 3);
                break;
            case 'i':
                //(m > 0) ICITI -> IC (eg. electriciti -> electric)
                if (str.endsWith("iciti") && stringMeasure(str.substring(0, str.length() - 3)) > 0)
                    result = str.substring(0, str.length() - 3);
                break;
            case 'l':
                //(m > 0) ICAL -> IC (eg. electrical -> electric)
                if (str.endsWith("ical") && stringMeasure(str.substring(0, str.length() - 2)) > 0)
                    result = str.substring(0, str.length() - 2);
                //(m > 0) FUL -> (eg. hopeful -> hope)
                else if (str.endsWith("ful") && stringMeasure(str.substring(0, str.length() - 3)) > 0)
                    result = str.substring(0, str.length() - 3);
                break;
            case 's':
                 //(m > 0) NESS -> (eg. goodness -> good)
                if (str.endsWith("ness") && stringMeasure(str.substring(0, str.length() - 4)) > 0)
                    result = str.substring(0, str.length() - 4);
                break;
            }
        }//if

        return result;
    }


//--------------------- STEP 4 ---------------------//

    /**
    * Step 4 of the stemmer.
    * Its function is straightforward.
    * The test for the string str is done with a "switch" on the penultimate letter.
    * @param str String representing the up-to-now word.
    */
    private String step4(String str)
    {
        String result = str;

        if (str.length() > 2)
        {

            switch (str.charAt(str.length() - 2))
            {
            case 'a':
                //(m > 1) AL -> (eg. revival ->  reviv)
                if (str.endsWith("al") && stringMeasure(str.substring(0, str.length() - 2)) > 1)
                    result = str.substring(0, str.length() - 2);
                break;
            case 'c':
                //(m > 1) ANCE -> (eg. allowance -> allow)
                if (str.endsWith("ance") && stringMeasure(str.substring(0, str.length() - 4)) > 1)
                    result = str.substring(0, str.length() - 4);
                //(m > 1) ENCE -> (eg. inference -> infer)
                else if (str.endsWith("ence") && stringMeasure(str.substring(0, str.length() - 4)) > 1)
                    result = str.substring(0, str.length() - 4);
                break;
            case 'e':
                //(m > 1) ER -> (eg. airliner -> airlin)
                if (str.endsWith("er") && stringMeasure(str.substring(0, str.length() - 2)) > 1)
                    result = str.substring(0, str.length() - 2);
                break;
            case 'i':
                //(m > 1) IC -> (eg. gyroscopic -> gyroscop)
                if (str.endsWith("ic") && stringMeasure(str.substring(0, str.length() - 2)) > 1)
                    result = str.substring(0, str.length() - 2);
                break;
            case 'l':
                //(m > 1) ABLE -> (eg. adjustable -> adjust)
                if (str.endsWith("able") && stringMeasure(str.substring(0, str.length() - 4)) > 1)
                    result = str.substring(0, str.length() - 4);
                //(m > 1) IBLE -> (eg. defensible -> defens)
                else if (str.endsWith("ible") && stringMeasure(str.substring(0, str.length() - 4)) > 1)
                    result = str.substring(0, str.length() - 4);
                break;
            case 'n':
                //(m > 1) ANT -> (eg. irritant -> irrit)
                if (str.endsWith("ant") && stringMeasure(str.substring(0, str.length() - 3)) > 1)
                    result = str.substring(0, str.length() - 3);
                //(m > 1) EMENT -> (eg. replacement -> replac)
                else if (str.endsWith("ement") && stringMeasure(str.substring(0, str.length() - 5)) > 1)
                    result = str.substring(0, str.length() - 5);
                //(m > 1) MENT -> (eg. adjustment -> adjust)
                else if (str.endsWith("ment") && stringMeasure(str.substring(0, str.length() - 4)) > 1)
                    result = str.substring(0, str.length() - 4);
                //(m > 1) ENT -> (eg. dependent -> depend)
                else if (str.endsWith("ent") && stringMeasure(str.substring(0, str.length() - 3)) > 1)
                    result = str.substring(0, str.length() - 3);
                break;
            case 'o':
                //(m > 1) and (*S or *T) ION -> (eg. adoption -> adopt)
                if ((str.endsWith("sion") || str.endsWith("tion")) && stringMeasure(str.substring(0, str.length() - 3)) > 1)
                    result = str.substring(0, str.length() - 3);
                //(m > 1) OU -> (eg. homologou -> homolog)
                else if (str.endsWith("ou") && stringMeasure(str.substring(0, str.length() - 2)) > 1)
                    result = str.substring(0, str.length() - 2);
                break;
            case 's':
                //(m > 1) ISM -> (eg. communism -> commun)
                if (str.endsWith("ism") && stringMeasure(str.substring(0, str.length() - 3)) > 1)
                    result = str.substring(0, str.length() - 3);
                break;
            case 't':
                //(m > 1) ATE -> (eg. activate -> activ)
                if (str.endsWith("ate") && stringMeasure(str.substring(0, str.length() - 3)) > 1)
                    result = str.substring(0, str.length() - 3);
                //(m > 1) ITI -> (eg. angulariti -> angular)
                else if (str.endsWith("iti") && stringMeasure(str.substring(0, str.length() - 3)) > 1)
                    result = str.substring(0, str.length() - 3);
                break;
            case 'u':
                //(m > 1) OUS -> (eg. homologous -> homolog)
                if (str.endsWith("ous") && stringMeasure(str.substring(0, str.length() - 3)) > 1)
                    result = str.substring(0, str.length() - 3);
                break;
            case 'v':
                //(m > 1) IVE -> (eg. effective -> effect)
                if (str.endsWith("ive") && stringMeasure(str.substring(0, str.length() - 3)) > 1)
                    result = str.substring(0, str.length() - 3);
                break;
            case 'z':
                //(m > 1) IZE -> (eg. bowdlerize -> bowdler)
                if (str.endsWith("ize") && stringMeasure(str.substring(0, str.length() - 3)) > 1)
                    result = str.substring(0, str.length() - 3);
                break;
            }
        }//if

        return result;
    }



//--------------------- STEP 5 ---------------------//

    /**
    * Step 5a of the stemmer.
    * It deals with words ending with -e.
    * @param str String representing the up-to-now word.
    */
    private String step5a(String str)
    {
        String result = str;

        if (str.endsWith("e"))
        {
            //(m > 1) E -> (eg. probate -> probat, rate -> rate)
            if (stringMeasure(str.substring(0, str.length() - 1)) > 1)
                result = str.substring(0, str.length() -1);
            //(m = 1 and not *0) E -> (eg. cease -> ceas)
            else if (stringMeasure(str.substring(0, str.length() - 1)) == 1 &&
                 !endsWithCVC(str.substring(0, str.length() - 1)))
                 result = str.substring(0, str.length() - 1);
        }

        return result;
    }


    /**
    * Step 5b of the stemmer.
    * It deals with words ending with double consonant L.
    * @param str String representing the up-to-now word.
    */
    private String step5b(String str)
    {
        //(m > 1 and *d and *L) ->
        if (str.endsWith("l") && endsWithDoubleConsonant(str) &&
            stringMeasure(str.substring(0, str.length() - 1)) > 1)
                return str.substring(0, str.length() - 1);
        else
            return str;
    }

    
//--------------------- Utility Methods ---------------------//

    /**
    * It returns true if a string contains a vowel.
    * It returns false otherwise.
    * @param str String representing the word we want to know whether or not it contains a vowel.
    */
    private boolean containsVowel(String str)
    {
        for (int i=0; i<str.length(); i++)
            if (isVowel(str, i))
                return true;
        
        return false;
    }

    /**
    * It returns true if a char is a vowel.
    * It returns false otherwise.
    * @param str String representing the word we want to process.
    * @param i Integer representing the i-th character of the string.
    */
    private boolean isVowel(String str, int i)
    {
        return !isConsonant(str, i);
    }

    /**
    * It returns true if a char is a consonant.
    * It returns false otherwise.
    * A consonant in a word is a letter other than A, E, I, O or U,
    * and other than Y preceded by a consonant.
    * @param str String representing the word we want to process.
    * @param i Integer representing the i-th character of the string.
    */
    private boolean isConsonant(String str, int i)
    {
        switch (str.charAt(i))
        {
            case 'a':
            case 'e':
            case 'i':
            case 'o':
            case 'u':
                    return false;
            case 'y':
                    return (i==0) ? true : !isConsonant(str, i-1);
            default:
                    return true;
        }
    }


    /**
    * It returns true if the two last characters of a word are the same consonants.
    * It returns false otherwise.
    * @param str String representing the word we want to process.
    */
    private boolean endsWithDoubleConsonant(String str)
    {
        boolean result = false;

        if (str.length() >= 2)
        {
            if (str.charAt(str.length() - 1) == str.charAt(str.length() - 2) &&
                isConsonant(str, str.length() - 1))
                    result = true;
        }

        return result;

    }


    /**
    * It returns the CVC measure of a string as it is defined in Porter's algorithm.
    * Any word, or part of a word can be represented by: [C](VC)^m[V],
    * where m is called measure.
    * @param str String representing the word we want to process.
    */
    private int stringMeasure(String str)
    {
        int measure = 0;      //measure
        boolean ok = false;   //if we meet a vowel, ok=true

        for (int i=0; i < str.length(); i++)
        {
            if (isVowel(str, i))
                ok = true;
            else if (ok)
            {
                measure += 1;
                ok = false;
            }
        }

        return measure;
    }


    /**
    * It returns true if a string ends with CVC, where the second c is not W, X or Y.
    * It returns false otherwise.
    * @param str String representing the word we want to process.
    */
    private boolean endsWithCVC(String str)
    {
        char c1, v, c2;
        //crar c2= ' ';

        if (str.length() >= 3)
        {
            c1 = str.charAt(str.length() - 1);
            v = str.charAt(str.length() - 2);
            c2 = str.charAt(str.length() - 3);
        }
        else
            return false;

        if (c1 == 'w' || c1 == 'x' || c1 == 'y')
            return false;
        else if (! (isConsonant(str, str.length() - 1) &&
                    isVowel(str, str.length() - 2) &&
                    isConsonant(str, str.length() - 3)) )
            return false;
        else
            return true;
    }

}//PorterStemmer
