package edu.cmu.minorthird.text;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Properties;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;

/* loaded from: input_file:edu/cmu/minorthird/text/RegexTokenizer.class */
/**
 * A {@link Tokenizer} that extracts tokens with a regular expression.
 *
 * <p>The expression is applied repeatedly with {@link Matcher#find()}; for every
 * match, capturing group 1 is taken as the token. The expression must therefore
 * define at least one capturing group.
 *
 * <p>The default expression for new instances, {@link #standardTokenRegexPattern},
 * is resolved once when the class is loaded, using the first source that defines
 * {@link #TOKEN_REGEX_PROP}:
 * <ol>
 *   <li>a {@code token.properties} resource on the classpath,</li>
 *   <li>the JVM system property of the same name,</li>
 *   <li>{@link #TOKEN_REGEX_DEFAULT_VALUE}.</li>
 * </ol>
 */
public class RegexTokenizer implements Tokenizer {
    private static final Logger log = Logger.getLogger(RegexTokenizer.class);

    /** Property key used to look up the tokenization regex. */
    public static final String TOKEN_REGEX_PROP = "edu.cmu.minorthird.tokenRegex";

    /**
     * Fallback regex: optional whitespace, then (as group 1) a run of digits, a run
     * of ASCII letters, or a single non-word character, then optional whitespace.
     */
    public static final String TOKEN_REGEX_DEFAULT_VALUE = "\\s*([0-9]+|[a-zA-Z]+|\\W)\\s*";

    /** Regex given to instances created with the no-argument constructor. */
    public static String standardTokenRegexPattern;

    /** Regex used by this instance; it is compiled on each tokenization call. */
    public String regexPattern;

    static {
        Properties properties = new Properties();
        try {
            InputStream resourceAsStream = FancyLoader.class.getClassLoader().getResourceAsStream("token.properties");
            if (resourceAsStream != null) {
                try {
                    properties.load(resourceAsStream);
                    log.debug("loaded properties from stream " + resourceAsStream);
                } finally {
                    // Always release the classpath resource, even if load() fails.
                    resourceAsStream.close();
                }
            } else {
                log.info("no token.properties found on classpath");
            }
        } catch (Exception e) {
            // Best effort: a broken properties file must not prevent class loading,
            // but the failure should be visible together with its cause.
            log.warn("can't open token.properties:" + e, e);
        }
        // The properties file wins over the system property, which wins over the default.
        standardTokenRegexPattern = properties.getProperty(TOKEN_REGEX_PROP, System.getProperty(TOKEN_REGEX_PROP, TOKEN_REGEX_DEFAULT_VALUE));
        log.info("tokenization regex: " + standardTokenRegexPattern);
    }

    /** Creates a tokenizer that uses {@link #standardTokenRegexPattern}. */
    public RegexTokenizer() {
        this.regexPattern = standardTokenRegexPattern;
    }

    /**
     * Creates a tokenizer that uses the given regex.
     *
     * @param str regular expression whose capturing group 1 delimits a token
     */
    public RegexTokenizer(String str) {
        this.regexPattern = str;
    }

    /**
     * Splits a string into tokens.
     *
     * @param str the text to tokenize
     * @return the text of capturing group 1 for each successive match, in order
     */
    @Override // edu.cmu.minorthird.text.Tokenizer
    public String[] splitIntoTokens(String str) {
        ArrayList<String> tokens = new ArrayList<String>();
        Matcher matcher = newMatcher(str);
        while (matcher.find()) {
            tokens.add(matcher.group(1));
        }
        return tokens.toArray(new String[tokens.size()]);
    }

    /**
     * Splits the text of a document into tokens.
     *
     * @param document the document whose text (from {@code getText()}) is tokenized
     * @return one {@link TextToken} per successive match, each constructed from the
     *         document, the start offset of capturing group 1, and that group's
     *         extent (end offset minus start offset)
     */
    @Override // edu.cmu.minorthird.text.Tokenizer
    public TextToken[] splitIntoTokens(Document document) {
        ArrayList<TextToken> tokens = new ArrayList<TextToken>();
        Matcher matcher = newMatcher(document.getText());
        while (matcher.find()) {
            int start = matcher.start(1);
            tokens.add(new TextToken(document, start, matcher.end(1) - start));
        }
        return tokens.toArray(new TextToken[0]);
    }

    /** Compiles the current {@link #regexPattern} and returns a matcher over {@code text}. */
    private Matcher newMatcher(CharSequence text) {
        return Pattern.compile(this.regexPattern).matcher(text);
    }
}
