/*
 * Created on Mar 25, 2005
 *
 * TODO To change the template for this generated file go to
 * Window - Preferences - Java - Code Style - Code Templates
 */
package ro.uottawa.balie;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.ListIterator;

import ca.uottawa.balie.*;

/**
 * Tokenizer subclass that fills the inherited token list with {@link RoToken}
 * instances and never emits a token for a bare new-line character.
 *
 * The class also provides a small batch driver ({@link #main(String[])}) that
 * tokenizes every file of the {@link #TOKENIZER_CORPUS} directory and writes
 * one XML file per input into the {@link #TOKENIZER_RESULTS} directory.
 *
 * @author ONUTZ
 */
public class RoTokenizer extends Tokenizer {
	
	/** Directory (relative to the working directory) holding the input text files. */
	final static String TOKENIZER_CORPUS = "Tokenizer_Corpus";
	/** Directory (relative to the working directory) receiving the generated XML files. */
	final static String TOKENIZER_RESULTS = "Tokenizer_Results";
	
	/**
	 * Creates a tokenizer; all arguments are forwarded unchanged to the
	 * {@link Tokenizer} constructor.
	 *
	 * @param pi_Language                  language identifier handed to the base tokenizer
	 * @param pi_DetectSentenceBoundariesr sentence-boundary detection flag handed to the base tokenizer
	 * @param pi_bTrimWhiter               white-space trimming flag handed to the base tokenizer
	 *                                     (presumably what drives m_bTrimWhite -- confirm in Tokenizer)
	 */
	public RoTokenizer (String pi_Language, boolean pi_DetectSentenceBoundariesr,boolean pi_bTrimWhiter ){	
      super( pi_Language,  pi_DetectSentenceBoundariesr, pi_bTrimWhiter );  
	}
	
	/**
	 * Tokenizes the given text into the inherited token list.
	 *
	 * The text is split on single spaces. For every non-empty chunk, the leading
	 * non-alphanumeric characters are emitted one by one as punctuation tokens, the
	 * trailing non-alphanumeric characters are buffered, the remaining core is passed
	 * to the language-specific decompounder and emitted, and finally the buffered
	 * trailing punctuation is emitted in its original left-to-right order. A token
	 * whose raw text is exactly "\n" is never added. Once all chunks are processed,
	 * part-of-speech values are assigned to the token list.
	 *
	 * @param pi_Text the raw text to tokenize
	 * @throws Error if the number of tokens counted here differs from the size of the token list
	 */
	public void RoTokenize(String pi_Text) {
		// NOTE(review): echoes the whole input on stdout; looks like a debugging
		// left-over. Kept because it is an observable side effect -- confirm and remove.
		System.out.println(pi_Text);
		DebugInfo.Out("Tokenizing", Balie.DEBUG_TOKENIZER);
		
		// Remove \r since the SBR model was trained without them
		String strText = pi_Text.replaceAll("\r", "");
		
		// Put spaces around new-line chars so that they are not hidden in the middle of a token
		strText = strText.replaceAll("\n", " \n ");
		
		// Trim leading and trailing white spaces.
		// When learning sentence boundaries, it is useful to keep white spaces.
		if (m_bTrimWhite) {
			strText = strText.trim();
		}

		// Split the input on the space character only; other white-space
		// characters (e.g. tabs) stay inside the chunks.
		String[] rawTokens = strText.split(" ");

		// Intensively re-allocated structures are kept out of the loop
		String 		strCurrentString 	= "";
		String 		strCurrentRaw 		= "";
		String 		strCurrentCanon 	= "";
		ArrayList 	alTrailBuffer 		= new ArrayList();
		
		for (int i = 0; i != rawTokens.length; ++i) {
			strCurrentString = rawTokens[i];
			
			// Skip empty chunks (produced by consecutive spaces)
			if (!strCurrentString.equals("")) {
				
				int nStringLength = strCurrentString.length();

				// Tokenize leading punctuation, one character at a time
				int nNumLead = 0;
				while (nNumLead != nStringLength && !Character.isLetterOrDigit(strCurrentString.charAt(nNumLead))) {
					strCurrentRaw   = strCurrentString.substring(nNumLead,nNumLead+1);
					strCurrentCanon = Canonizer.CanonForm(strCurrentRaw, Canonizer.RULE_NORMALIZE_PUNCT, m_PunctLookup, m_LigatureLookup);
					// New-line characters never become tokens
					if (!strCurrentRaw.equals("\n")){
						RoToken curToken = new RoToken(	strCurrentRaw, 
												strCurrentCanon,
												TokenConsts.TYPE_PUNCTUATION, 
												m_PunctLookup, 
												m_NumTokens++,
												m_NumSentences);
						boolean bIsSB = m_TokenList.Add(curToken, m_SBR, m_SBRModel);
						if (bIsSB) {
							NewSentence();
						}
					}
					++nNumLead;
				}
				
				// Buffer trailing punctuation (only if the chunk is not punctuation-only).
				// The scan runs right-to-left and is guaranteed to stop: the character
				// at nNumLead is a letter or digit whenever nNumLead < nStringLength.
				alTrailBuffer.clear();
				int nNumTrail = 0;
				if (nNumLead < nStringLength) {
					while (!Character.isLetterOrDigit(strCurrentString.charAt(nStringLength-nNumTrail-1))) {
						// Handle unbreakables: stop stripping as soon as what is left is an unbreakable
						if (m_UnbreakableLookup.IsUnbreakable(strCurrentString.substring(nNumLead, nStringLength-nNumTrail))) {
							break;
						}
						strCurrentRaw = strCurrentString.substring(nStringLength-nNumTrail-1,nStringLength-nNumTrail);
						strCurrentCanon = Canonizer.CanonForm(strCurrentRaw, Canonizer.RULE_NORMALIZE_PUNCT, m_PunctLookup, m_LigatureLookup);
						alTrailBuffer.add(new String[] {strCurrentRaw, strCurrentCanon});
						++nNumTrail;
					}
				}
				
				// Tokenize the remaining core (if something remains)
				if (nNumLead < nStringLength) {
					String [] strTokens = m_LanguageSpecific.Decompound(strCurrentString.substring(nNumLead,nStringLength-nNumTrail));
					for (int j = 0; j != strTokens.length; ++j) {
						strCurrentRaw    = strTokens[j];
						
						// A single non-alphanumeric character is punctuation, anything else is a word
						int nTokenType = TokenConsts.TYPE_WORD;
						if (strCurrentRaw.length() == 1 && !Character.isLetterOrDigit(strCurrentRaw.charAt(0))) {
							nTokenType = TokenConsts.TYPE_PUNCTUATION;
							strCurrentCanon  = Canonizer.CanonForm(strCurrentRaw, Canonizer.RULE_NORMALIZE_PUNCT, m_PunctLookup, m_LigatureLookup);					
						} else {
							strCurrentCanon  = Canonizer.CanonForm(strCurrentRaw, m_Rules, m_PunctLookup, m_LigatureLookup);											
						}
						
						// Words are always added; punctuation is added unless it is a new-line
						if (nTokenType == TokenConsts.TYPE_WORD || !strCurrentRaw.equals("\n")) {
							boolean bIsSB = super.GetTokenList().Add(new RoToken(	strCurrentRaw,
													strCurrentCanon, 
													nTokenType, 
													m_PunctLookup, 
													m_NumTokens++,
													m_NumSentences), m_SBR, m_SBRModel);
							if (bIsSB) {
								NewSentence();
							}
						}
					}
				}
				
				// Tokenize the buffered trailing punctuation. It was buffered right-to-left,
				// so walking the buffer backwards restores the original order.
				ListIterator iTrailCur =  alTrailBuffer.listIterator(alTrailBuffer.size());
				while (iTrailCur.hasPrevious()) {
					String [] strBuffered = (String[])iTrailCur.previous();
					if (!strBuffered[0].equals("\n")){
						boolean bIsSB = super.GetTokenList().Add(new RoToken(strBuffered[0], 
												strBuffered[1],
												TokenConsts.TYPE_PUNCTUATION, 
												m_PunctLookup,
												m_NumTokens++,
												m_NumSentences), m_SBR, m_SBRModel);
						if (bIsSB) {
							NewSentence();
						}
					}
				}
			}
		}
		// Close last sentence..
		++m_NumSentences;
		
		// Assign part-of-speech
		int[] nPOS = m_POSLookup.GetPartOfSpeech(super.GetTokenList().WordList());
		
		for (int i = 0; i != nPOS.length; ++i) {
			super.GetTokenList().SetPOS(i, nPOS[i]);
		}
		
		// Sanity check: every m_NumTokens++ above must correspond to one list entry
		if (super.TokenCount() != super.GetTokenList().Size()) {
			throw new Error("Inconsistant TokenList. Num tokens counted by tokenizer and tokenlist mismatch.");
		}
	}

	/**
	 * Batch driver: for every entry of the {@link #TOKENIZER_CORPUS} directory
	 * (except "CVS"), detects the language of the file, tokenizes it and writes
	 * the XML rendering of the token list to {@link #TOKENIZER_RESULTS}, using
	 * the input file name with its extension replaced by ".xml".
	 *
	 * A failure on one file is reported and does not stop the processing of the
	 * remaining files.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {	
		
		LanguageIdentification li = new LanguageIdentification();
		File fBasePath = new File(TOKENIZER_CORPUS);
		File foBasePath = new File(TOKENIZER_RESULTS);
		
		// File.list() returns null when the path is not a readable directory
		String[] strTokenFiles = fBasePath.list();
		if (strTokenFiles == null) {
			System.err.println("Cannot list corpus directory: " + fBasePath.getAbsolutePath());
			return;
		}
		
		for (int i = 0; i != strTokenFiles.length; ++i) {
			String strFileName = strTokenFiles[i];
			if (strFileName.equals("CVS")) {
				continue;
			}
			
			// Strip the extension if there is one; keep the whole name otherwise
			int nDot = strFileName.lastIndexOf(".");
			String strBaseName = (nDot >= 0) ? strFileName.substring(0, nDot) : strFileName;
			
			BufferedWriter out = null;
			try {
				// File(parent, child) inserts the platform's own separator.
				// NOTE(review): FileWriter uses the platform default charset while the
				// input is read as UTF-8 -- confirm this matches what ToXML declares.
				out = new BufferedWriter(new FileWriter(new File(foBasePath, strBaseName + ".xml")));
				
				String strInputPath = new File(fBasePath, strFileName).getAbsolutePath();
				String strContent = FileHandler.GetTextFileContent(strInputPath, Balie.UTF8_ENCODING);
				String strID_Language = li.DetectLanguage(strContent);
				System.out.println(strID_Language);
				
				// Compare by content: reference comparison on strings is unreliable
				if (!"LANGUAGE_UNKNOWN".equals(strID_Language)) {
					RoTokenizer t = new RoTokenizer(strID_Language, true, true);
					t.RoTokenize(strContent);
					TokenList alTokenList = t.GetTokenList();
					RoTokenList rt = new RoTokenList(true);
					rt.setRoTokenList(alTokenList);
					out.write(rt.ToXML(strID_Language).toString());
					out.flush();
				} else {
					System.out
							.println("I am sorry! I coudn't identify your language!");
				}
			} catch (Exception e) {
				// Report which file failed and the exception type; getMessage() alone may be null
				System.err.println("Failed to process " + strFileName + ": " + e);
			} finally {
				// Always release the output file handle, even on failure
				if (out != null) {
					try {
						out.close();
					} catch (Exception e) {
						System.err.println("Failed to close output for " + strFileName + ": " + e);
					}
				}
			}
		}
	}
}
