/*
 * Decompiled with CFR 0.152.
 */
package org.tribuo.util.tokens.impl.wordpiece;

import com.oracle.labs.mlrg.olcut.config.Config;
import com.oracle.labs.mlrg.olcut.config.Configurable;
import com.oracle.labs.mlrg.olcut.provenance.ConfiguredObjectProvenance;
import com.oracle.labs.mlrg.olcut.provenance.impl.ConfiguredObjectProvenanceImpl;
import org.tribuo.util.tokens.impl.SplitFunctionTokenizer;

/**
 * A {@link SplitFunctionTokenizer} whose split function classifies each code point as
 * whitespace, punctuation, (optionally) a Chinese character, a control/invalid character,
 * or an ordinary word character, and returns the corresponding
 * {@link SplitFunctionTokenizer.SplitResult}.
 */
public class WordpieceBasicTokenizer extends SplitFunctionTokenizer {
    /** U+00A0 NO-BREAK SPACE; {@link Character#isWhitespace(int)} excludes non-breaking spaces. */
    private static final int NO_BREAK_SPACE = 0x00A0;

    /** U+0000 NULL. */
    private static final int NULL_CHARACTER = 0x0000;

    /** U+FFFD REPLACEMENT CHARACTER. */
    private static final int REPLACEMENT_CHARACTER = 0xFFFD;

    @Config(description="split on Chinese tokens?")
    private boolean tokenizeChineseChars = true;

    /**
     * Builds the split function used by this tokenizer.
     * <p>
     * The checks are applied in this order, and the first match wins:
     * <ol>
     * <li>whitespace or U+00A0: {@code SPLIT_AT}</li>
     * <li>punctuation (see {@link #isPunctuation(int)}): {@code SPLIT_BEFORE_AND_AFTER_PUNCTUATION}</li>
     * <li>Chinese character (see {@link #isChinese(int)}), only when {@code tokenizeChineseChars}
     * is true: {@code SPLIT_BEFORE_AND_AFTER_WORD}</li>
     * <li>U+0000, U+FFFD or a control character (see {@link #isControl(int)}): {@code SPLIT_AT}</li>
     * <li>anything else: {@code NO_SPLIT_WORD}</li>
     * </ol>
     *
     * @param tokenizeChineseChars whether Chinese characters get their own split result.
     * @return the split function.
     */
    public static SplitFunctionTokenizer.SplitFunction createSplitFunction(boolean tokenizeChineseChars) {
        return (codepoint, index, cs) -> {
            // Character.isWhitespace does not treat U+00A0 as whitespace, so it is tested separately.
            if (Character.isWhitespace(codepoint) || codepoint == NO_BREAK_SPACE) {
                return SplitFunctionTokenizer.SplitResult.SPLIT_AT;
            }
            if (WordpieceBasicTokenizer.isPunctuation(codepoint)) {
                return SplitFunctionTokenizer.SplitResult.SPLIT_BEFORE_AND_AFTER_PUNCTUATION;
            }
            if (tokenizeChineseChars && WordpieceBasicTokenizer.isChinese(codepoint)) {
                return SplitFunctionTokenizer.SplitResult.SPLIT_BEFORE_AND_AFTER_WORD;
            }
            if (codepoint == NULL_CHARACTER || codepoint == REPLACEMENT_CHARACTER
                    || WordpieceBasicTokenizer.isControl(codepoint)) {
                return SplitFunctionTokenizer.SplitResult.SPLIT_AT;
            }
            return SplitFunctionTokenizer.SplitResult.NO_SPLIT_WORD;
        };
    }

    /**
     * Tests whether a code point is punctuation.
     * <p>
     * Every non-alphanumeric printable ASCII character ({@code !} to {@code /}, {@code :} to
     * {@code @}, {@code [} to {@code `}, <code>{</code> to {@code ~}) counts as punctuation,
     * including symbols such as {@code $} and {@code ^} which are not in a Unicode punctuation
     * category. Outside those ranges the Unicode general category decides.
     *
     * @param codepoint the code point to test.
     * @return true if the code point is treated as punctuation.
     */
    public static boolean isPunctuation(int codepoint) {
        boolean asciiPunctuation = inRange(codepoint, '!', '/')
                || inRange(codepoint, ':', '@')
                || inRange(codepoint, '[', '`')
                || inRange(codepoint, '{', '~');
        if (asciiPunctuation) {
            return true;
        }
        switch (Character.getType(codepoint)) {
            case Character.DASH_PUNCTUATION:
            case Character.START_PUNCTUATION:
            case Character.END_PUNCTUATION:
            case Character.CONNECTOR_PUNCTUATION:
            case Character.OTHER_PUNCTUATION:
            case Character.INITIAL_QUOTE_PUNCTUATION:
            case Character.FINAL_QUOTE_PUNCTUATION:
                return true;
            default:
                return false;
        }
    }

    /**
     * Tests whether a code point lies in one of the CJK ideograph ranges listed below.
     *
     * @param codepoint the code point to test.
     * @return true if the code point falls inside any of the ranges.
     */
    public static boolean isChinese(int codepoint) {
        return inRange(codepoint, 0x4E00, 0x9FFF)       // CJK Unified Ideographs
                || inRange(codepoint, 0x3400, 0x4DBF)   // CJK Unified Ideographs Extension A
                || inRange(codepoint, 0x20000, 0x2A6DF) // CJK Unified Ideographs Extension B
                || inRange(codepoint, 0x2A700, 0x2B73F) // CJK Unified Ideographs Extension C
                || inRange(codepoint, 0x2B740, 0x2B81F) // CJK Unified Ideographs Extension D
                || inRange(codepoint, 0x2B820, 0x2CEAF) // CJK Unified Ideographs Extension E
                || inRange(codepoint, 0xF900, 0xFAFF)   // CJK Compatibility Ideographs
                || inRange(codepoint, 0x2F800, 0x2FA1F); // CJK Compatibility Ideographs Supplement
    }

    /**
     * Tests whether a code point is a control character.
     * <p>
     * Tab, line feed and carriage return are never reported as control characters. Any other
     * code point is a control character when its Unicode general category is
     * {@link Character#CONTROL}, {@link Character#FORMAT}, {@link Character#PRIVATE_USE} or
     * {@link Character#SURROGATE}.
     * <p>
     * The code point is compared as an {@code int}; no {@code char[]} is allocated and no
     * exception is thrown for values outside the Unicode range.
     *
     * @param codepoint the code point to test.
     * @return true if the code point is treated as a control character.
     */
    public static boolean isControl(int codepoint) {
        if (codepoint == '\t' || codepoint == '\n' || codepoint == '\r') {
            return false;
        }
        switch (Character.getType(codepoint)) {
            case Character.CONTROL:
            case Character.FORMAT:
            case Character.PRIVATE_USE:
            case Character.SURROGATE:
                return true;
            default:
                return false;
        }
    }

    /** Returns true when {@code low <= value <= high}. */
    private static boolean inRange(int value, int low, int high) {
        return value >= low && value <= high;
    }

    /**
     * Creates a tokenizer with {@code tokenizeChineseChars} left at its default of true.
     */
    public WordpieceBasicTokenizer() {
        this.postConfig();
    }

    /**
     * Creates a tokenizer with the given Chinese-character handling.
     *
     * @param tokenizeChineseChars whether Chinese characters get their own split result.
     */
    public WordpieceBasicTokenizer(boolean tokenizeChineseChars) {
        this.tokenizeChineseChars = tokenizeChineseChars;
        this.postConfig();
    }

    /**
     * Rebuilds the split function from the current value of {@code tokenizeChineseChars}.
     * Both constructors call this after the field has been set.
     */
    public void postConfig() {
        this.splitFunction = WordpieceBasicTokenizer.createSplitFunction(this.tokenizeChineseChars);
    }

    /**
     * Creates a provenance object from this instance and the label {@code "Tokenizer"}.
     *
     * @return the provenance.
     */
    public ConfiguredObjectProvenance getProvenance() {
        return new ConfiguredObjectProvenanceImpl((Configurable)this, "Tokenizer");
    }

    /**
     * Returns a new tokenizer configured with the same {@code tokenizeChineseChars} value.
     *
     * @return a fresh, independently constructed copy.
     */
    @Override
    public WordpieceBasicTokenizer clone() {
        return new WordpieceBasicTokenizer(this.tokenizeChineseChars);
    }
}

