package com.wandoujia.nerkit.train;

import com.wandoujia.nerkit.LoggerFactory;
import com.wandoujia.nerkit.extractor.Extractor;
import com.wandoujia.nerkit.extractor.MiraExtractor;
import com.wandoujia.nerkit.nlp.IOB2;
import com.wandoujia.nerkit.nlp.TokenFeature;
import com.wandoujia.nerkit.nlp.mira.LazyResourceProxy;
import com.wandoujia.nerkit.nlp.mira.Template;
import com.wandoujia.nerkit.nlp.structure.Token;
import com.wandoujia.nerkit.nlp.tokenizer.Tokenizer;
import com.wandoujia.nerkit.util.FeatureUtils;
import com.wandoujia.nerkit.util.ResourceArrayList;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: classes.dex */
/**
 * {@link Learner} that builds a {@link MiraExtractor} from annotated text samples.
 *
 * <p>Each sample is a string in which annotated spans are written as inline markup matching
 * {@link #TAG_PATTERN}, e.g. {@code <NAME>some text</NAME>}. Text inside a span is tokenized and
 * labelled with the span's opening tag name; text outside any span is labelled with
 * {@link IOB2#TAG_OUTSIDE}. The labelled tokens are converted to feature rows with
 * {@link FeatureUtils#buildMatrix} and handed to
 * {@link com.wandoujia.nerkit.nlp.mira.Learner#train}.
 */
public class MiraLearner implements Learner<Extractor> {
    static final LoggerFactory.Logger LOG = LoggerFactory.getLogger(MiraLearner.class);

    /**
     * Matches one annotated span. Group 1 is the opening tag name, group 2 is the enclosed text
     * (which cannot contain {@code '<'}). The closing tag name is not captured and is not
     * checked against the opening one.
     */
    static final Pattern TAG_PATTERN = Pattern.compile("<([^>]+)>([^<]*)</[^>]+>");

    /** Passed through unchanged to the MIRA learner's {@code train} call. */
    final int cutoff;
    /** Token features used to build the training matrix; also handed to the resulting extractor. */
    final List<TokenFeature> features;
    /** Passed through unchanged to the MIRA learner's {@code train} call. */
    final int iterations;
    /** Passed through unchanged to the MIRA learner's {@code train} call. */
    final boolean randomInit;
    /** Annotated training samples, wrapped in a resource proxy. */
    final LazyResourceProxy<ResourceArrayList<String>> samples;
    /** Feature templates passed to the MIRA learner. */
    final List<Template> templates;
    /** Tokenizer applied to sample text; also handed to the resulting extractor. */
    final Tokenizer tokenizer;

    /**
     * Creates a learner over the given annotated samples.
     *
     * @param list      token features (stored as {@link #features})
     * @param tokenizer tokenizer used to split sample text into tokens
     * @param list2     feature templates (stored as {@link #templates})
     * @param i         value stored as {@link #iterations}
     * @param i2        value stored as {@link #cutoff}
     * @param z         value stored as {@link #randomInit}
     * @param list3     annotated training samples, one string per sample
     */
    public MiraLearner(List<TokenFeature> list, Tokenizer tokenizer, List<Template> list2, int i, int i2, boolean z, List<String> list3) {
        this.features = list;
        this.tokenizer = tokenizer;
        this.templates = list2;
        this.iterations = i;
        this.cutoff = i2;
        this.randomInit = z;
        this.samples = new LazyResourceProxy<>(new ResourceArrayList(list3));
    }

    /**
     * Tokenizes the un-annotated text the matcher has not yet consumed and labels every token
     * with {@link IOB2#TAG_OUTSIDE}.
     *
     * @param matcher matcher over the sample; its append position is advanced by this call
     * @param z       {@code true} to take the text after the last match (via
     *                {@code appendTail}); {@code false} to take the text between the previous
     *                append position and the current match (via {@code appendReplacement} with
     *                an empty replacement, so the matched span itself is dropped)
     * @return the outside tokens, or an empty list when there is no such text
     */
    List<Token> getOutsideTokens(Matcher matcher, boolean z) {
        // StringBuffer rather than StringBuilder: the Matcher append methods only gained
        // StringBuilder overloads in Java 9.
        StringBuffer outside = new StringBuffer();
        if (z) {
            matcher.appendTail(outside);
        } else {
            matcher.appendReplacement(outside, "");
        }
        if (outside.length() == 0) {
            return Collections.emptyList();
        }
        return tokenizeWithTag(outside.toString(), IOB2.TAG_OUTSIDE);
    }

    /**
     * Converts one annotated sample into a flat, labelled token sequence, preserving text order.
     *
     * @param str the annotated sample
     * @return tokens of the whole sample; tokens from annotated spans carry the span's tag,
     *         all other tokens carry {@link IOB2#TAG_OUTSIDE}
     */
    List<Token> parseAndTokenize(String str) {
        Matcher matcher = TAG_PATTERN.matcher(str);
        List<Token> tokens = new ArrayList<>();
        while (matcher.find()) {
            // Plain text preceding this span first, then the span itself.
            tokens.addAll(getOutsideTokens(matcher, false));
            tokens.addAll(tokenizeWithTag(matcher.group(2), matcher.group(1)));
        }
        // Whatever follows the last span (or the entire sample if it had no spans).
        tokens.addAll(getOutsideTokens(matcher, true));
        return tokens;
    }

    /**
     * Tokenizes {@code str} and labels each resulting token.
     *
     * <p>When {@code str2} equals {@link IOB2#TAG_OUTSIDE} every token gets that label as-is.
     * Otherwise the first token is labelled {@link IOB2#TAG_PREFIX_BEGIN}{@code + str2} and each
     * following token {@link IOB2#TAG_PREFIX_INSIDE}{@code + str2}.
     *
     * @param str  text to tokenize
     * @param str2 tag name to attach
     * @return new tokens carrying the text and part of speech produced by the tokenizer plus
     *         the computed label
     */
    List<Token> tokenizeWithTag(String str, String str2) {
        List<Token> tagged = new ArrayList<>();
        // The outside check does not depend on the token, so evaluate it once.
        boolean outside = IOB2.TAG_OUTSIDE.equals(str2);
        boolean first = true;
        for (Token token : this.tokenizer.getTokens(str)) {
            String prefix;
            if (outside) {
                prefix = "";
            } else if (first) {
                prefix = IOB2.TAG_PREFIX_BEGIN;
            } else {
                prefix = IOB2.TAG_PREFIX_INSIDE;
            }
            tagged.add(new Token(token.getText(), prefix + str2, token.getPartOfSpeech()));
            first = false;
        }
        return tagged;
    }

    /**
     * Builds the training matrix from all samples and trains a MIRA model on it.
     *
     * @return a {@link MiraExtractor} wrapping the trained model together with this learner's
     *         features and tokenizer
     */
    // The matrix row type is dictated by FeatureUtils.buildMatrix and the MIRA learner, neither
    // of which is visible from this class, so the matrix is deliberately left raw.
    @SuppressWarnings({"rawtypes", "unchecked"})
    @Override // com.wandoujia.nerkit.train.Learner
    public Extractor train() {
        ArrayList matrix = new ArrayList();
        for (String sample : this.samples.getResource()) {
            matrix.addAll(FeatureUtils.buildMatrix(this.features, parseAndTokenize(sample)));
            // An empty row follows every sample's rows.
            // NOTE(review): presumably a sequence separator expected by the MIRA learner - confirm.
            matrix.add(new ArrayList());
        }
        return new MiraExtractor(new LazyResourceProxy(new com.wandoujia.nerkit.nlp.mira.Learner().train(this.templates, matrix, this.iterations, this.cutoff, this.randomInit)), this.features, this.tokenizer);
    }
}
