package org.apache.tika.parser.html;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.tika.config.Field;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.ParseContext;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Parser;
import org.jsoup.parser.Tag;
import org.jsoup.parser.TagSet;
import org.jsoup.select.NodeFilter;
import org.jsoup.select.NodeTraversor;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/* loaded from: input_file:org/apache/tika/parser/html/JSoupParser.class */
/**
 * HTML parser that builds a jsoup {@link Document} from the input and replays
 * the resulting node tree as SAX events to the caller's {@link ContentHandler}.
 *
 * <p>The events are routed through an {@link XHTMLDowngradeHandler} wrapping an
 * {@link HtmlHandler}, which is configured with the {@link HtmlMapper} found in
 * the {@link ParseContext} (or a {@link DefaultHtmlMapper} if none is present).
 */
public class JSoupParser extends AbstractEncodingDetectorParser {
    private static final long serialVersionUID = 7895315240498733128L;

    /** Charset used when the encoding detector returns {@code null}. */
    public static final Charset DEFAULT_CHARSET = StandardCharsets.US_ASCII;

    private static final MediaType XHTML = MediaType.application("xhtml+xml");
    private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
    private static final MediaType X_ASP = MediaType.application("x-asp");
    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.unmodifiableSet(
                    new HashSet<>(Arrays.asList(MediaType.text("html"), XHTML, WAP_XHTML, X_ASP)));

    /**
     * Tag set handed to every jsoup parser created by this class. It starts as
     * jsoup's HTML tag set and is extended by the static initializer below.
     */
    private static final TagSet SELF_CLOSEABLE_TAGS = TagSet.Html();

    // Must stay textually after SELF_CLOSEABLE_TAGS: static initializers run in source order.
    static {
        InputStream resource = JSoupParser.class.getResourceAsStream("self-closeable-tags.txt");
        if (resource == null) {
            // getResourceAsStream signals a missing resource with null, not an IOException.
            throw new RuntimeException("Can't find self-closeable-tags.txt");
        }
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(resource, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                // Lines starting with '#' and blank lines carry no tag name.
                if (line.startsWith("#") || line.trim().isEmpty()) {
                    continue;
                }
                SELF_CLOSEABLE_TAGS
                        .valueOf(line.trim(), "http://www.w3.org/1999/xhtml")
                        .set(Tag.SelfClose);
            }
        } catch (IOException e) {
            throw new RuntimeException("Can't find self-closeable-tags.txt", e);
        }
    }

    /** Whether script content is extracted; passed through to {@link HtmlHandler}. */
    @Field
    private boolean extractScripts = false;

    /**
     * Unchecked carrier used to tunnel a {@link SAXException} out of the
     * {@link NodeFilter} callbacks, whose signatures declare no checked exceptions.
     */
    private static class RuntimeSAXException extends RuntimeException {
        private final SAXException wrapped;

        private RuntimeSAXException(SAXException sAXException) {
            // Keep the cause chain intact in case this ever escapes unwrapped.
            super(sAXException);
            this.wrapped = sAXException;
        }

        /** @return the original SAX exception */
        SAXException getWrapped() {
            return this.wrapped;
        }
    }

    /**
     * jsoup node visitor that turns each visited node into SAX events on the
     * wrapped handler. Text and data nodes become {@code characters} events;
     * every other node becomes a {@code startElement}/{@code endElement} pair
     * named after {@link Node#nodeName()}.
     */
    private static class TikaNodeFilter implements NodeFilter {
        private final ContentHandler handler;

        private TikaNodeFilter(ContentHandler contentHandler) {
            this.handler = contentHandler;
        }

        /**
         * Emits the opening event for {@code node}.
         *
         * @throws RuntimeSAXException if the handler throws a {@link SAXException}
         */
        @Override
        public NodeFilter.FilterResult head(Node node, int i) {
            if (node instanceof TextNode) {
                emitCharacters(((TextNode) node).getWholeText());
                return NodeFilter.FilterResult.CONTINUE;
            }
            if (node instanceof DataNode) {
                emitCharacters(((DataNode) node).getWholeData());
                return NodeFilter.FilterResult.CONTINUE;
            }
            AttributesImpl saxAttributes = new AttributesImpl();
            for (Attribute attribute : node.attributes()) {
                saxAttributes.addAttribute(
                        "", attribute.getKey(), attribute.getKey(), "", attribute.getValue());
            }
            try {
                this.handler.startElement("", node.nodeName(), node.nodeName(), saxAttributes);
            } catch (SAXException e) {
                throw new RuntimeSAXException(e);
            }
            return NodeFilter.FilterResult.CONTINUE;
        }

        /**
         * Emits the closing event for {@code node}; text and data nodes have none.
         *
         * @throws RuntimeSAXException if the handler throws a {@link SAXException}
         */
        @Override
        public NodeFilter.FilterResult tail(Node node, int i) {
            if ((node instanceof TextNode) || (node instanceof DataNode)) {
                return NodeFilter.FilterResult.CONTINUE;
            }
            try {
                this.handler.endElement("", node.nodeName(), node.nodeName());
            } catch (SAXException e) {
                throw new RuntimeSAXException(e);
            }
            return NodeFilter.FilterResult.CONTINUE;
        }

        /** Sends {@code text} as a single characters event; null or empty text is skipped. */
        private void emitCharacters(String text) {
            if (text == null || text.isEmpty()) {
                return;
            }
            char[] chars = text.toCharArray();
            try {
                this.handler.characters(chars, 0, chars.length);
            } catch (SAXException e) {
                throw new RuntimeSAXException(e);
            }
        }
    }

    /** Creates a parser with the superclass's default encoding detector and script extraction off. */
    public JSoupParser() {
    }

    /**
     * Creates a parser with script extraction off.
     *
     * @param encodingDetector detector passed to the superclass
     */
    public JSoupParser(EncodingDetector encodingDetector) {
        super(encodingDetector);
    }

    /**
     * @return the fixed set of supported types: {@code text/html},
     *     {@code application/xhtml+xml}, {@code application/vnd.wap.xhtml+xml}
     *     and {@code application/x-asp}
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return SUPPORTED_TYPES;
    }

    /** @return whether script extraction is enabled */
    public boolean isExtractScripts() {
        return this.extractScripts;
    }

    /** @param z {@code true} to enable script extraction */
    @Field
    public void setExtractScripts(boolean z) {
        this.extractScripts = z;
    }

    /**
     * Parses an HTML stream and emits SAX events to {@code contentHandler}.
     *
     * <p>The charset comes from the encoding detector, falling back to
     * {@link #DEFAULT_CHARSET} when detection returns {@code null}. The
     * {@code Content-Type} metadata is rewritten to carry that charset when it is
     * absent or starts with one of the supported types, and
     * {@code Content-Encoding} is set to the charset name.
     *
     * @param inputStream     HTML input; wrapped in a {@link CloseShieldInputStream} before
     *                        being handed to jsoup
     * @param contentHandler  receiver of the SAX events
     * @param metadata        read for {@code Content-Type}; updated as described above
     * @param parseContext    source of the optional {@link HtmlMapper} and {@link EncodingDetector}
     * @throws IOException   if detection or reading the stream fails
     * @throws SAXException  if the handler chain rejects an event
     * @throws TikaException declared by the parser contract
     */
    @Override
    public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
        Charset detected = getEncodingDetector(parseContext).detect(inputStream, metadata);
        Charset charset = detected == null ? DEFAULT_CHARSET : detected;

        String declaredType = metadata.get("Content-Type");
        MediaType mediaType = null;
        if (declaredType == null || declaredType.startsWith("text/html")) {
            mediaType = new MediaType(MediaType.TEXT_HTML, charset);
        } else if (declaredType.startsWith("application/xhtml+xml")) {
            mediaType = new MediaType(XHTML, charset);
        } else if (declaredType.startsWith("application/vnd.wap.xhtml+xml")) {
            mediaType = new MediaType(WAP_XHTML, charset);
        } else if (declaredType.startsWith("application/x-asp")) {
            mediaType = new MediaType(X_ASP, charset);
        }
        // Any other declared type is left exactly as the caller supplied it.
        if (mediaType != null) {
            metadata.set("Content-Type", mediaType.toString());
        }
        metadata.set("Content-Encoding", charset.name());

        Document document = Jsoup.parse(
                CloseShieldInputStream.wrap(inputStream),
                charset.name(),
                "",
                Parser.htmlParser().tagSet(SELF_CLOSEABLE_TAGS));
        emitSaxEvents(document, contentHandler, metadata, parseContext);
    }

    /**
     * Parses HTML that is already available as a string and emits SAX events to
     * {@code contentHandler}. No encoding detection or metadata update is performed.
     *
     * @param str             the HTML markup
     * @param contentHandler  receiver of the SAX events
     * @param metadata        passed through to {@link HtmlHandler}
     * @param parseContext    source of the optional {@link HtmlMapper}
     * @throws SAXException if the handler chain rejects an event
     */
    public void parseString(String str, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws SAXException {
        Document document = Jsoup.parse(str, Parser.htmlParser().tagSet(SELF_CLOSEABLE_TAGS));
        emitSaxEvents(document, contentHandler, metadata, parseContext);
    }

    /**
     * Returns the {@link EncodingDetector} registered in {@code parseContext},
     * or this parser's own detector when the context has none.
     */
    /* JADX INFO: Access modifiers changed from: protected */
    @Override
    public EncodingDetector getEncodingDetector(ParseContext parseContext) {
        EncodingDetector fromContext = parseContext.get(EncodingDetector.class);
        return fromContext != null ? fromContext : getEncodingDetector();
    }

    /**
     * Puts {@code document} into quirks mode and walks it, emitting SAX events
     * bracketed by {@code startDocument}/{@code endDocument}.
     *
     * <p>{@code endDocument} is invoked exactly once, whether or not the
     * traversal fails.
     */
    private void emitSaxEvents(Document document, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws SAXException {
        HtmlMapper htmlMapper = parseContext.get(HtmlMapper.class, new DefaultHtmlMapper());
        document.quirksMode(Document.QuirksMode.quirks);
        XHTMLDowngradeHandler xhtml = new XHTMLDowngradeHandler(
                new HtmlHandler(htmlMapper, contentHandler, metadata, parseContext, this.extractScripts));
        xhtml.startDocument();
        try {
            NodeTraversor.filter(new TikaNodeFilter(xhtml), document);
        } catch (RuntimeSAXException e) {
            // Unwrap so callers see the checked exception they declared.
            throw e.getWrapped();
        } finally {
            xhtml.endDocument();
        }
    }
}
