package com.wcohen.ss.expt;

import cern.colt.matrix.impl.AbstractFormatter;
import com.wcohen.ss.api.Token;
import com.wcohen.ss.api.Tokenizer;
import com.wcohen.ss.expt.Blocker;
import com.wcohen.ss.expt.MatchData;
import com.wcohen.ss.tokens.SimpleTokenizer;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

/* loaded from: input_file:com/wcohen/ss/expt/TokenBlocker.class */
/**
 * A {@link Blocker} that proposes a candidate pair for every two instances
 * that share at least one token.
 *
 * <p>The source with fewer instances is indexed token-by-token; each instance
 * of the other source is then looked up against that index. A token that
 * occurs in more than {@code maxFraction * (number of indexed instances)}
 * instances is treated as a stopword and never generates pairs.
 *
 * <p>The default {@code maxFraction} is 1.0 unless the system property
 * {@code blockerMaxFraction} holds a parseable double.
 */
public class TokenBlocker extends Blocker {
    /** Default for {@code maxFraction}, read once when the class is loaded. */
    private static final double DEFAULT_MAX_FRACTION = readDefaultMaxFraction();

    /**
     * Sentinel stored in the index in place of a token's instance set once the
     * token has been classified as a stopword. It is only ever compared by
     * identity and is never modified.
     */
    private static final Set<Integer> STOPWORD_TOKEN_MARKER = new HashSet<Integer>();

    /** Pairs produced by the last call to {@link #block}; empty before the first call. */
    private ArrayList<Blocker.Pair> pairList = new ArrayList<Blocker.Pair>();
    protected Tokenizer tokenizer;
    private double maxFraction;
    private int numCorrectPairs;

    /**
     * @param tokenizer tokenizer applied to every instance's unwrapped text
     * @param d maximum fraction of indexed instances a token may occur in
     *     before it is ignored as a stopword
     */
    public TokenBlocker(Tokenizer tokenizer, double d) {
        this.tokenizer = tokenizer;
        this.maxFraction = d;
    }

    /** Creates a blocker with {@link SimpleTokenizer#DEFAULT_TOKENIZER} and the default fraction. */
    public TokenBlocker() {
        this(SimpleTokenizer.DEFAULT_TOKENIZER, DEFAULT_MAX_FRACTION);
    }

    /** @return the stopword threshold, as a fraction of the indexed source's size */
    public double getMaxFraction() {
        return this.maxFraction;
    }

    /** @param d the new stopword threshold; takes effect on the next {@link #block} call */
    public void setMaxFraction(double d) {
        this.maxFraction = d;
    }

    /**
     * Computes the candidate pairs for {@code matchData}, replacing any pairs
     * from a previous call.
     *
     * @param matchData the data to block; must have exactly one source in
     *     cluster mode and exactly two otherwise
     * @throws IllegalArgumentException if the number of sources does not match the mode
     */
    @Override // com.wcohen.ss.expt.Blocker
    public void block(MatchData matchData) {
        this.numCorrectPairs = countCorrectPairs(matchData);
        this.pairList = new ArrayList<Blocker.Pair>();
        if (!this.clusterMode && matchData.numSources() != 2) {
            throw new IllegalArgumentException("need exactly two sources out of clusterMode");
        }
        if (this.clusterMode && matchData.numSources() != 1) {
            throw new IllegalArgumentException("need exactly one source in clusterMode");
        }

        String smallSource = matchData.getSource(0);
        String bigSource = this.clusterMode ? matchData.getSource(0) : matchData.getSource(1);
        if (matchData.numInstances(smallSource) > matchData.numInstances(bigSource)) {
            // Swap through a temporary so the source with fewer instances is the one indexed.
            String tmp = smallSource;
            smallSource = bigSource;
            bigSource = tmp;
        }

        Map<Token, Set<Integer>> index = buildIndex(matchData, smallSource);

        // When both sides are the same source, each unordered pair must be emitted
        // only once and an instance must not be paired with itself.
        boolean sameSource = smallSource.equals(bigSource);
        Set<Integer> alreadyPaired = new TreeSet<Integer>();
        int bigCount = matchData.numInstances(bigSource);
        for (int bigIdx = 0; bigIdx < bigCount; bigIdx++) {
            MatchData.Instance bigInstance = matchData.getInstance(bigSource, bigIdx);
            // Tracks indexed instances already paired with bigInstance, so that
            // sharing several tokens still yields a single pair.
            alreadyPaired.clear();
            for (Token token : this.tokenizer.tokenize(bigInstance.unwrap())) {
                Set<Integer> containers = index.get(token);
                if (containers == null || containers == STOPWORD_TOKEN_MARKER) {
                    continue;
                }
                for (Integer smallIdx : containers) {
                    if (alreadyPaired.contains(smallIdx)) {
                        continue;
                    }
                    if (sameSource && smallIdx.intValue() <= bigIdx) {
                        continue;
                    }
                    MatchData.Instance smallInstance =
                            matchData.getInstance(smallSource, smallIdx.intValue());
                    this.pairList.add(
                            new Blocker.Pair(bigInstance, smallInstance, smallInstance.sameId(bigInstance)));
                    alreadyPaired.add(smallIdx);
                }
            }
        }
    }

    /**
     * Builds the token index for {@code source}: each token maps to the sorted
     * set of instance positions containing it, or to
     * {@link #STOPWORD_TOKEN_MARKER} once that set grows beyond
     * {@code numInstances(source) * maxFraction}.
     */
    private Map<Token, Set<Integer>> buildIndex(MatchData matchData, String source) {
        int count = matchData.numInstances(source);
        double maxSetSize = count * this.maxFraction;
        Map<Token, Set<Integer>> index = new TreeMap<Token, Set<Integer>>();
        for (int i = 0; i < count; i++) {
            Token[] tokens = this.tokenizer.tokenize(matchData.getInstance(source, i).unwrap());
            for (Token token : tokens) {
                Set<Integer> containers = index.get(token);
                if (containers == STOPWORD_TOKEN_MARKER) {
                    // Already a stopword: nothing to record, and the shared marker must stay untouched.
                    continue;
                }
                if (containers == null) {
                    containers = new TreeSet<Integer>();
                    index.put(token, containers);
                }
                containers.add(Integer.valueOf(i));
                if (containers.size() > maxSetSize) {
                    index.put(token, STOPWORD_TOKEN_MARKER);
                }
            }
        }
        return index;
    }

    /** @return the number of pairs produced by the last {@link #block} call (0 before any call) */
    @Override // com.wcohen.ss.expt.Blocker
    public int size() {
        return this.pairList.size();
    }

    /**
     * @param i position of the pair, in the order the pairs were generated
     * @return the {@code i}-th candidate pair
     */
    @Override // com.wcohen.ss.expt.Blocker
    public Blocker.Pair getPair(int i) {
        return this.pairList.get(i);
    }

    @Override
    public String toString() {
        return "[TokenBlocker:clusterMode=" + this.clusterMode + ",maxFraction=" + this.maxFraction + "]";
    }

    /** @return the value computed by {@code countCorrectPairs} during the last {@link #block} call */
    @Override // com.wcohen.ss.expt.Blocker
    public int numCorrectPairs() {
        return this.numCorrectPairs;
    }

    /**
     * Debugging aid: prints each key of {@code map} followed by the elements of
     * the set it maps to, one key per line, to standard output.
     */
    private void showIndex(Map<?, ?> map) {
        for (Map.Entry<?, ?> entry : map.entrySet()) {
            System.out.print(entry.getKey().toString());
            for (Object id : (Set<?>) entry.getValue()) {
                System.out.print(AbstractFormatter.DEFAULT_COLUMN_SEPARATOR + id);
            }
            System.out.println();
        }
    }

    /**
     * Reads the {@code blockerMaxFraction} system property.
     *
     * @return the parsed property value, or 1.0 when the property is unset or not a valid double
     */
    private static double readDefaultMaxFraction() {
        String property = System.getProperty("blockerMaxFraction");
        if (property == null) {
            return 1.0d;
        }
        try {
            return Double.parseDouble(property);
        } catch (NumberFormatException ignored) {
            // Deliberate best effort: a malformed property falls back to the built-in default.
            return 1.0d;
        }
    }
}
