package edu.cmu.minorthird.text.learn;

import cern.colt.matrix.impl.AbstractFormatter;
import edu.cmu.minorthird.text.AbstractAnnotator;
import edu.cmu.minorthird.text.FancyLoader;
import edu.cmu.minorthird.text.MonotonicTextLabels;
import edu.cmu.minorthird.text.MutableTextLabels;
import edu.cmu.minorthird.text.RegexTokenizer;
import edu.cmu.minorthird.text.Span;
import edu.cmu.minorthird.text.SpanDifference;
import edu.cmu.minorthird.text.TextLabels;
import edu.cmu.minorthird.text.TextLabelsLoader;
import edu.cmu.minorthird.text.gui.TextBaseViewer;
import edu.cmu.minorthird.text.mixup.MixupInterpreter;
import edu.cmu.minorthird.text.mixup.MixupProgram;
import edu.cmu.minorthird.util.IOUtil;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.log4j.Logger;

/* loaded from: input_file:edu/cmu/minorthird/text/learn/NameMatcher.class */
public class NameMatcher extends AbstractAnnotator {
    private String predType;
    private String spanType;
    private ArrayList nameDict;
    private static final String DIV = "@#!";
    private static final int WINDOW_SIZE = 5;
    private static final int SIG_SIZE = 2;
    private ArrayList lowRiskNameList;
    private ArrayList highRiskNameList;
    private ArrayList deletedNameList;
    private static Logger log = Logger.getLogger(NameMatcher.class);
    private static double threshold = 16.0d;
    private static MonotonicTextLabels postLabels = null;
    private static boolean Remove_Single_Tokens_Low_PFIDF = true;

    public NameMatcher(String str) {
        this.predType = "_prediction";
        this.spanType = "true_name";
        this.nameDict = new ArrayList();
        this.lowRiskNameList = new ArrayList();
        this.highRiskNameList = new ArrayList();
        this.deletedNameList = new ArrayList();
        this.spanType = str;
    }

    public NameMatcher() {
        this.predType = "_prediction";
        this.spanType = "true_name";
        this.nameDict = new ArrayList();
        this.lowRiskNameList = new ArrayList();
        this.highRiskNameList = new ArrayList();
        this.deletedNameList = new ArrayList();
    }

    @Override // edu.cmu.minorthird.text.AbstractAnnotator
    protected void doAnnotate(MonotonicTextLabels monotonicTextLabels) {
        HashSet hashSet = new HashSet();
        Span.Looper instanceIterator = monotonicTextLabels.instanceIterator(this.predType);
        while (instanceIterator.hasNext()) {
            hashSet.add(((Span) instanceIterator.next()).asString());
        }
        this.nameDict = new ArrayList(hashSet);
        Collections.sort(this.nameDict, new Comparator() { // from class: edu.cmu.minorthird.text.learn.NameMatcher.1
            @Override // java.util.Comparator
            public int compare(Object obj, Object obj2) {
                return new Integer(((String) obj2).length()).compareTo(new Integer(((String) obj).length()));
            }
        });
        transformDict(new FreqAnal(monotonicTextLabels, this.predType));
        int i = 0;
        System.out.println("Low Risk Names:");
        Iterator it = this.lowRiskNameList.iterator();
        while (it.hasNext()) {
            i++;
            System.out.println(i + ". " + it.next());
        }
        int i2 = 0;
        System.out.println("High Risk Names:");
        Iterator it2 = this.highRiskNameList.iterator();
        while (it2.hasNext()) {
            i2++;
            System.out.println(i2 + ". " + it2.next());
        }
        int i3 = 0;
        System.out.println("Deleted Names:");
        Iterator it3 = this.deletedNameList.iterator();
        while (it3.hasNext()) {
            i3++;
            System.out.println(i3 + ". " + it3.next());
        }
        applyDictIncreaseRecall(monotonicTextLabels);
        if (Remove_Single_Tokens_Low_PFIDF) {
            applyDictIncreasePrecision(postLabels);
        }
    }

    @Override // edu.cmu.minorthird.text.AbstractAnnotator, edu.cmu.minorthird.text.Annotator
    public String explainAnnotation(TextLabels textLabels, Span span) {
        return "No explanation implemented.";
    }

    private void applyDictIncreaseRecall(MonotonicTextLabels monotonicTextLabels) {
        int i = 0;
        Span.Looper documentSpanIterator = monotonicTextLabels.getTextBase().documentSpanIterator();
        while (documentSpanIterator.hasNext()) {
            Span nextSpan = documentSpanIterator.nextSpan();
            i++;
            System.out.println(((i / monotonicTextLabels.getTextBase().size()) * 100.0f) + "% Working on " + nextSpan.getDocumentId() + "...");
            int i2 = 0;
            while (i2 < nextSpan.size()) {
                Span subSpan = nextSpan.subSpan(i2, Math.min(nextSpan.size() - i2, 5));
                Span dictLookup = dictLookup(this.lowRiskNameList, subSpan);
                if (dictLookup != null) {
                    System.out.println("! Found: " + dictLookup.asString().replaceAll("[\r\n\\s]+", AbstractFormatter.DEFAULT_COLUMN_SEPARATOR) + " matches " + subSpan.asString().replaceAll("[\r\n\\s]+", AbstractFormatter.DEFAULT_COLUMN_SEPARATOR));
                    monotonicTextLabels.addToType(dictLookup, this.predType + "_updated");
                    i2 += dictLookup.size() - 1;
                }
                i2++;
            }
            int size = nextSpan.size() - 2;
            while (size < nextSpan.size()) {
                Span subSpan2 = nextSpan.subSpan(size, Math.min(nextSpan.size() - size, 5));
                Span dictLookup2 = dictLookup(this.highRiskNameList, subSpan2);
                if (dictLookup2 != null) {
                    System.out.println("! Found: " + dictLookup2.asString().replaceAll("[\r\n\\s]+", AbstractFormatter.DEFAULT_COLUMN_SEPARATOR) + " matches " + subSpan2.asString().replaceAll("[\r\n\\s]+", AbstractFormatter.DEFAULT_COLUMN_SEPARATOR));
                    monotonicTextLabels.addToType(dictLookup2, this.predType + "_updated");
                    size += dictLookup2.size() - 1;
                }
                size++;
            }
        }
        postLabels = monotonicTextLabels;
    }

    private void applyDictIncreasePrecision(MonotonicTextLabels monotonicTextLabels) {
        int i = 0;
        Span.Looper documentSpanIterator = monotonicTextLabels.getTextBase().documentSpanIterator();
        while (documentSpanIterator.hasNext()) {
            Span nextSpan = documentSpanIterator.nextSpan();
            i++;
            System.out.println(((i / monotonicTextLabels.getTextBase().size()) * 100.0f) + "% Working on " + nextSpan.getDocumentId() + "...");
            Span.Looper instanceIterator = monotonicTextLabels.instanceIterator(this.predType, nextSpan.getDocumentId());
            while (instanceIterator.hasNext()) {
                Span nextSpan2 = instanceIterator.nextSpan();
                if (nextSpan2.size() == 1) {
                    if (this.deletedNameList.contains(nextSpan2.getToken(0).getValue().toLowerCase())) {
                        monotonicTextLabels.setProperty(nextSpan2.getToken(0), "delete", "t");
                    }
                }
            }
        }
        postLabels = monotonicTextLabels;
    }

    private Span dictLookup(ArrayList arrayList, Span span) {
        RegexTokenizer regexTokenizer = new RegexTokenizer();
        Iterator it = arrayList.iterator();
        while (it.hasNext()) {
            String str = (String) it.next();
            if (span.asString().replaceAll("[\r\n\\s]+", AbstractFormatter.DEFAULT_COLUMN_SEPARATOR).toLowerCase().matches("(?i)(?s)^\\Q" + str + "\\E(\\W|$).*")) {
                return span.subSpan(0, regexTokenizer.splitIntoTokens(str).length);
            }
        }
        return null;
    }

    private void transformDict(FreqAnal freqAnal) {
        Iterator it = this.nameDict.iterator();
        while (it.hasNext()) {
            Iterator it2 = transformName((String) it.next()).iterator();
            while (it2.hasNext()) {
                String str = (String) it2.next();
                boolean z = str.indexOf(DIV) == -1;
                boolean matches = str.matches("(\\w@#!)+");
                String replaceAll = str.replaceAll(DIV, "");
                Double hScore = freqAnal.getHScore(replaceAll);
                if (hScore != null && hScore.doubleValue() < threshold) {
                    this.deletedNameList.add(replaceAll);
                } else if (z) {
                    this.lowRiskNameList.add(replaceAll);
                } else if (matches) {
                    this.highRiskNameList.add(replaceAll);
                }
            }
        }
        this.lowRiskNameList = uniqueSortedList(this.lowRiskNameList);
        this.highRiskNameList = uniqueSortedList(this.highRiskNameList);
        this.deletedNameList = uniqueSortedList(this.deletedNameList);
    }

    /* JADX WARN: Multi-variable type inference failed */
    /* JADX WARN: Type inference failed for: r0v18, types: [int[], int[][]] */
    /* JADX WARN: Type inference failed for: r0v22, types: [int[], int[][]] */
    /* JADX WARN: Type inference failed for: r0v26, types: [int[], int[][]] */
    /* JADX WARN: Type inference failed for: r0v44, types: [int[], int[][]] */
    private ArrayList transformName(String str) {
        ArrayList arrayList = new ArrayList();
        String[] split = str.toLowerCase().trim().replaceAll("[^a-zA-Z\\- ]+", "").split("[\\- ]+");
        Object[] objArr = new Object[0];
        if (split.length == 1) {
            objArr = transform(split, new int[]{new int[]{0}});
        } else if (split.length == 2) {
            objArr = transform(split, new int[]{new int[]{0, 1}, new int[]{0}});
        } else if (split.length == 3) {
            objArr = transform(split, new int[]{new int[]{0, 1, 2}, new int[]{0, 2}, new int[]{2}, new int[]{0}});
        } else if (split.length == 4) {
            objArr = transform(split, new int[]{new int[]{0, 1, 2, 3}, new int[]{0, 1, 3}, new int[]{0, 3}, new int[]{3}, new int[]{0}});
        }
        for (Object obj : objArr) {
            String trim = ((String) obj).trim();
            if (trim.replaceAll("\\W", "").length() >= 2 && !trim.matches(".*-$")) {
                arrayList.add(trim);
            }
        }
        return arrayList;
    }

    /* JADX WARN: Multi-variable type inference failed */
    private Object[] transform(String[] strArr, int[][] iArr) {
        ArrayList arrayList = new ArrayList();
        Object[] objArr = new Object[strArr.length];
        int i = 0;
        while (i < strArr.length) {
            objArr[i] = transformToken(strArr[i], i == 0, i == strArr.length - 1);
            i++;
        }
        for (int[] iArr2 : iArr) {
            if (iArr2.length == 1) {
                for (int i2 = 0; i2 < objArr[iArr2[0]].length; i2++) {
                    arrayList.add(objArr[iArr2[0]][i2]);
                }
            } else if (iArr2.length == 2) {
                for (int i3 = 0; i3 < objArr[iArr2[0]].length; i3++) {
                    for (int i4 = 0; i4 < objArr[iArr2[1]].length; i4++) {
                        arrayList.add(((String) objArr[iArr2[0]][i3]) + objArr[iArr2[1]][i4]);
                    }
                }
            } else if (iArr2.length == 3) {
                for (int i5 = 0; i5 < objArr[iArr2[0]].length; i5++) {
                    for (int i6 = 0; i6 < objArr[iArr2[1]].length; i6++) {
                        for (int i7 = 0; i7 < objArr[iArr2[2]].length; i7++) {
                            arrayList.add(((String) objArr[iArr2[0]][i5]) + objArr[iArr2[1]][i6] + objArr[iArr2[2]][i7]);
                        }
                    }
                }
            } else if (iArr2.length == 4) {
                for (int i8 = 0; i8 < objArr[iArr2[0]].length; i8++) {
                    for (int i9 = 0; i9 < objArr[iArr2[1]].length; i9++) {
                        for (int i10 = 0; i10 < objArr[iArr2[2]].length; i10++) {
                            for (int i11 = 0; i11 < objArr[iArr2[3]].length; i11++) {
                                arrayList.add(((String) objArr[iArr2[0]][i8]) + objArr[iArr2[1]][i9] + objArr[iArr2[2]][i10] + objArr[iArr2[3]][i11]);
                            }
                        }
                    }
                }
            }
        }
        return arrayList.toArray();
    }

    private ArrayList uniqueSortedList(ArrayList arrayList) {
        HashMap hashMap = new HashMap();
        Iterator it = arrayList.iterator();
        while (it.hasNext()) {
            hashMap.put((String) it.next(), null);
        }
        ArrayList arrayList2 = new ArrayList(hashMap.keySet());
        Collections.sort(arrayList2, new Comparator() { // from class: edu.cmu.minorthird.text.learn.NameMatcher.2
            @Override // java.util.Comparator
            public int compare(Object obj, Object obj2) {
                return new Integer(((String) obj2).length()).compareTo(new Integer(((String) obj).length()));
            }
        });
        return arrayList2;
    }

    private Object[] transformToken(String str, boolean z, boolean z2) {
        ArrayList arrayList = new ArrayList();
        if (str.length() == 0) {
            return arrayList.toArray();
        }
        if (z2) {
            arrayList.add(str);
        }
        if (!z2) {
            arrayList.add(str + AbstractFormatter.DEFAULT_COLUMN_SEPARATOR);
        }
        if (!z2) {
            arrayList.add(str + "-");
        }
        if (!z2) {
            arrayList.add(str.substring(0, 1) + ". ");
        }
        if (z2) {
            arrayList.add(str.substring(0, 1) + ".");
        }
        arrayList.add(str.substring(0, 1) + DIV);
        return arrayList.toArray();
    }

    private static void usage() {
        System.err.println("ExtractorNameMatcher: increase recall of a previously-learned extractor, ");
        System.err.println("applying a name matching scheme");
        System.err.println("Parameters:");
        System.err.println(" -loadFrom FILE     where to load a previously-learner extractor from");
        System.err.println(" -labels KEY        the key for the labels, in which names are to be extracted");
        System.err.println(" -spanType String   the span type of the true names. Usually, it is 'true_name'");
        System.err.println(" [-saveAs FILE]     a file to save the new post-name matching labels");
        System.err.println("");
        System.exit(1);
    }

    public static void main(String[] strArr) throws IOException {
        File file = null;
        File file2 = new File("NM_labels.env");
        String str = "";
        MutableTextLabels mutableTextLabels = null;
        NameMatcher nameMatcher = new NameMatcher(str);
        for (int i = 0; i < strArr.length; i++) {
            if (strArr[i].equals("-loadFrom")) {
                file = new File(strArr[i + 1]);
            } else if (strArr[i].equals("-saveAs")) {
                file2 = new File(strArr[i + 1]);
            } else if (strArr[i].equals("-labels")) {
                mutableTextLabels = (MutableTextLabels) FancyLoader.loadTextLabels(strArr[i + 1]);
            } else if (strArr[i].equals("-spanType")) {
                str = strArr[i + 1];
            }
        }
        if (file == null || mutableTextLabels == null || str == null) {
            usage();
        }
        try {
            nameMatcher.doAnnotate((MonotonicTextLabels) ((ExtractorAnnotator) IOUtil.loadSerialized(file)).annotatedCopy(mutableTextLabels));
            MixupProgram mixupProgram = null;
            try {
                mixupProgram = new MixupProgram(new String[]{"defTokenProp email:t = ~re'([\\.\\-\\w+]+\\@[\\.\\-\\w\\+]+)',1;"});
                mixupProgram.addStatement("defSpanType email =: ... [email:t+R] ... ;");
                mixupProgram.addStatement("defTokenProp predicted_name:1 =: ... [@_prediction_updated] ... || ... [@_prediction] ... ;");
                mixupProgram.addStatement("defSpanType _prediction_updated_fixed =: ... [L <predicted_name:1, !email:t, !delete:t>+ R] ... ;");
            } catch (Exception e) {
                System.out.println(e);
            }
            new MixupInterpreter(mixupProgram).eval(postLabels);
            TextBaseViewer.view(postLabels);
            if (file2 != null) {
                try {
                    new TextLabelsLoader().saveTypesAsOps(postLabels, file2);
                } catch (IOException e2) {
                    try {
                        new TextLabelsLoader().saveTypesAsOps(postLabels, new File("name-matching-labels.env"));
                    } catch (Exception e3) {
                        System.out.println(e3);
                    }
                }
            }
            System.out.println("============================================================");
            System.out.println("Pre names-matching:");
            System.out.println(new SpanDifference(postLabels.instanceIterator(nameMatcher.predType), postLabels.instanceIterator(str), postLabels.closureIterator(str)).toSummary());
            System.out.println("Post names-matching:");
            System.out.println(new SpanDifference(postLabels.instanceIterator(nameMatcher.predType + "_updated_fixed"), postLabels.instanceIterator(str), postLabels.closureIterator(str)).toSummary());
        } catch (IOException e4) {
            throw new IllegalArgumentException("can't load annotator from " + file + ": " + e4);
        }
    }
}
