package de.xam.textsearch.tokenize;

import java.util.ArrayList;

import org.xydra.index.IIntegerRangeIndex;
import org.xydra.index.impl.IntegerRangeIndex;
import org.xydra.index.impl.IntegerRangeIndex.ISplitHandler;

import com.google.gwt.regexp.shared.SplitResult;

import de.xam.texthtml.text.Unicodes;

/**
 * An {@link ITokenizer} that cuts an input text into tokens with the help of an {@link IIntegerRangeIndex} holding
 * unicode separator ranges. Only token ranges are returned, i.e. ranges in which none of the characters are in the
 * given separator range index. This is similar to character groups in a regex.
 *
 * Not solved by this tokenizer: camelcasing, letternumber ('foobar13' remains a single token)
 *
 * @author xamde
 */
public class RangeBasedTokenizer implements ITokenizer {

	/**
	 * Split handler that remembers the text of every reported token and ignores separators.
	 */
	private static final class TokenCollector implements ISplitHandler {

		private final String text;

		private final ArrayList<String> tokens = new ArrayList<String>();

		TokenCollector(final String text) {
			this.text = text;
		}

		@Override
		public void onToken(final int startInclusive, final int endExclusive) {
			this.tokens.add(this.text.substring(startInclusive, endExclusive));
		}

		@Override
		public void onSeparator(final int startInclusive, final int endExclusive) {
			// separators are not part of the result
		}

		@Override
		public void onDone() {
			// nothing to finish
		}

		/** @return the collected tokens, in the order they were reported */
		String[] toArray() {
			return this.tokens.toArray(new String[0]);
		}
	}

	private final IIntegerRangeIndex separators;

	/**
	 * @param separators range index of the code points that are treated as separators when splitting
	 */
	public RangeBasedTokenizer(final IIntegerRangeIndex separators) {
		this.separators = separators;
	}

	/**
	 * Splits the complete term using the configured separators and returns the tokens that were reported, in
	 * reporting order. Separator ranges are dropped. The method is synchronized on this tokenizer instance.
	 *
	 * @param term the text to split; must not be null
	 * @return a {@link SplitResult} wrapping the token strings
	 */
	@Override
	public synchronized SplitResult split(final String term) {
		assert term != null;

		final TokenCollector collector = new TokenCollector(term);
		IntegerRangeIndex.split(term, 0, term.length(), this.separators, collector);
		return new SplitResult(collector.toArray());
	}

	/**
	 * Small demo: splits a fixed sample string with {@link Unicodes#unicodeSeparator_or_mixed} and prints every
	 * token and separator range to standard out.
	 *
	 * @param args ignored
	 */
	public static void main(final String[] args) {
		final String sample = "Hello World--this is a 123ä+test";
		final ISplitHandler printer = new ISplitHandler() {

			@Override
			public void onToken(final int startInclusive, final int endExclusive) {
				final String token = sample.substring(startInclusive, endExclusive);
				System.out.println("token '" + token + "'");
			}

			@Override
			public void onSeparator(final int startInclusive, final int endExclusive) {
				final String separator = sample.substring(startInclusive, endExclusive);
				System.out.println("sep '" + separator + "'");
			}

			@Override
			public void onDone() {
				// nothing to print at the end
			}

		};
		IntegerRangeIndex.split(sample, 0, sample.length(), Unicodes.unicodeSeparator_or_mixed, printer);
	}

}
