package de.xam.textsearch.tokenize;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import com.google.common.collect.Lists;
import com.google.gwt.regexp.shared.SplitResult;

import de.xam.texthtml.text.CamelCase;
import de.xam.texthtml.text.Unicodes;
import de.xam.textsearch.util.TextIndexTool;

/**
 * Built-in {@link ITokenizer} instances plus the regular expressions used for
 * regex-based splitting.
 *
 * Text is broken down in stages:
 *
 * phrase -> word (separated by space) -> token (separated by punctuation-like
 * characters) -> fragment (separated by alternating character family: casing,
 * letter vs. number)
 *
 * @author xamde
 *
 */
public class Tokenizers {

	/**
	 * Very simple tokenizer built on the regex {@code [ ]}, i.e. a single
	 * space character.
	 *
	 * IMPROVE performance: create a native version without regexes
	 */
	public static final ITokenizer SPACE = new RegexTokenizer("[ ]");

	/**
	 * Regex to split a phrase into words: matches one or more spaces, carriage
	 * returns, line feeds or tabs. Note that resulting words include leading
	 * and trailing punctuation.
	 */
	static final String PHRASE2WORDREGEX = "[ \\r\\n\\t]+";

	/**
	 * Regex to split a word into tokens: a character class of punctuation-like
	 * separators, matched one or more times. E.g. "Super-Product2014LE/14"
	 * becomes "Super", "Product2014LE", and "14".
	 *
	 * The trailing {@code // .} comments only keep one separator per line.
	 */
	static final String WORD2TOKENSREGEX =

			"[" // .
			+ " " // .
			+ "_"// .
			+ ","// .
			+ ":"// .
			+ ";"// .
			+ "!"// .
			+ "?"// .
			+ "'"// .
			+ "@"// .
			+ "*"// .
			+ "/"// .
			+ "\""// .
			+ "\\-"// .
			+ "\\."// .
			+ "\\("// .
			+ "\\)"// .
			+ "\\{"// .
			+ "\\}"// .
			+ "\\\\"// .
			+ "\\n"// .
			+ "\\r"// .
			+ "\\t"// .
			+ "&"// .
			+ "#"// .
			+ "+"// .
			+ "<"// .
			+ "="// .
			+ ">"// .
			+ "~"// .
			+ "$"// .
			+ "%"// .
			+ "|"// .
			+ "§"// .

			+ "]+";

	/**
	 * Splits a word or phrase all the way down to fragments: the input is first
	 * split with {@link #WORD_2_TOKEN_TOKENIZER__RANGEBASED}, then every token
	 * is split with {@link #TOKEN_2_FRAGMENT_TOKENIZER}. Each distinct fragment
	 * is returned once, in order of first occurrence.
	 *
	 * The two delegate tokenizers are declared further down; they are only
	 * dereferenced when {@code split} is called, not while this field is
	 * initialised.
	 */
	public static final ITokenizer WORD_OR_PHRASE_2_FRAGMENT_TOKENIZER__RANGEBASED = new ITokenizer() {

		/**
		 * @param word the text to split; must not be null (checked by assertion
		 *        only)
		 * @return the distinct fragments in first-occurrence order
		 */
		@Override
		public synchronized SplitResult split(final String word) {
			assert word != null;
			/* insertion-ordered set: drops duplicates and keeps a stable order */
			final Set<String> fragments = new LinkedHashSet<String>();

			final SplitResult tokenSplit = WORD_2_TOKEN_TOKENIZER__RANGEBASED.split(word);
			for (int i = 0; i < tokenSplit.length(); i++) {
				final SplitResult fragmentSplit = TOKEN_2_FRAGMENT_TOKENIZER.split(tokenSplit
						.get(i));
				for (int j = 0; j < fragmentSplit.length(); j++) {
					fragments.add(fragmentSplit.get(j));
				}
			}
			return toSplitResult(fragments);
		}
	};

	/**
	 * Range-based tokenizer configured with
	 * {@code Unicodes.unicodePureSeparator}; by its name it is meant to split a
	 * phrase into words (NOTE(review): exact separator set is defined in
	 * {@code Unicodes} - confirm there).
	 */
	public static final ITokenizer PHRASE_2_WORD_TOKENIZER__RANGEBASED = new RangeBasedTokenizer(
			Unicodes.unicodePureSeparator);

	/**
	 * Range-based tokenizer configured with
	 * {@code Unicodes.unicodeSeparator_or_mixed}; by its name it is meant to
	 * split a word into tokens (NOTE(review): exact separator set is defined in
	 * {@code Unicodes} - confirm there).
	 */
	public static final ITokenizer WORD_2_TOKEN_TOKENIZER__RANGEBASED = new RangeBasedTokenizer(
			Unicodes.unicodeSeparator_or_mixed);

	/**
	 * Split CamelCase and Word123 into fragments. Delegates to
	 * {@link CamelCase#splitCamelCaseAndDigits(String)} and returns each
	 * distinct fragment once, in order of first occurrence.
	 */
	public static final ITokenizer TOKEN_2_FRAGMENT_TOKENIZER = new ITokenizer() {

		/**
		 * @param token the token to split
		 * @return the distinct fragments in first-occurrence order
		 */
		@Override
		public synchronized SplitResult split(final String token) {
			/* insertion-ordered set: drops duplicates and keeps a stable order */
			final Set<String> fragments = new LinkedHashSet<String>();
			for (final String fragment : CamelCase.splitCamelCaseAndDigits(token)) {
				fragments.add(fragment);
			}
			return toSplitResult(fragments);
		}
	};

	/**
	 * Splits the given text with the given tokenizer and exposes the result as
	 * an iterator.
	 *
	 * @param word the text to split
	 * @param tokenizer the tokenizer to apply
	 * @return an iterator over the parts produced by {@code tokenizer}
	 */
	public static Iterator<String> tokenizeToIterator(final String word, final ITokenizer tokenizer) {
		return new TextIndexTool.SplitResultIterator(tokenizer.split(word));
	}

	/**
	 * Wraps the given fragments, in their iteration order, into a
	 * {@link SplitResult}.
	 *
	 * @param fragments the fragments to wrap
	 * @return a split result holding exactly the given fragments
	 */
	private static SplitResult toSplitResult(final Set<String> fragments) {
		return new SplitResult(fragments.toArray(new String[fragments.size()]));
	}

}
