package com.mathworks.toolbox.textanalytics;

import com.google.common.base.Function;
import com.google.common.collect.Lists;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Vector;
import java.util.regex.Pattern;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.CDATASection;
import org.w3c.dom.Comment;
import org.w3c.dom.DOMConfiguration;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.w3c.dom.ls.DOMImplementationLS;
import org.w3c.dom.ls.LSSerializer;
import org.xml.sax.InputSource;
import se.fishtank.css.selectors.Selectors;
import se.fishtank.css.selectors.dom.W3CNode;

/* loaded from: input_file:com/mathworks/toolbox/textanalytics/Html.class */
public class Html {
    public Node doc;
    private static final String WS_Regex = "(?U)\\s+";
    /**
     * Upper-case tag names that {@code cleanUpDivs} treats as inline content: a DIV whose
     * children all carry one of these names is renamed to a paragraph.
     *
     * Built from a plain list rather than double-brace initialisation so that no extra
     * anonymous {@code HashSet} subclass is generated just to populate the set.
     */
    private static final HashSet<String> inlineElements = new HashSet<String>(java.util.Arrays.asList(
            "A", "ABBR", "ACRONYM", "B", "BDI", "BDO", "BIG", "BR", "BUTTON", "CITE",
            "CODE", "DATA", "DEL", "DFN", "EM", "FONT", "I", "IMG", "INPUT", "INS",
            "KBD", "LABEL", "MAP", "MARK", "METER", "OBJECT", "PICTURE", "PROGRESS", "Q", "RP",
            "RT", "RUBY", "S", "SAMP", "SELECT", "SMALL", "SPAN", "STRIKE", "STRONG", "SUB",
            "SUP", "TEXTAREA", "TIME", "TT", "U", "VAR", "WBR"));
    private static Pattern unlikelyCandidateRe;
    private static Pattern okMaybeCandidateRe;
    private static Pattern positiveScoreRe;
    private static Pattern negativeScoreRe;

    /**
     * Wraps an already-parsed DOM node; the node is stored as-is, not copied.
     *
     * @param node the DOM node this wrapper exposes
     */
    private Html(Node node) {
        doc = node;
    }

    /**
     * Parses HTML markup with the NekoHTML {@code DOMParser} and wraps the document element of
     * the resulting tree.
     *
     * The markup is handed to the parser as a UTF-8 byte stream. The parser is configured with
     * a UTF-8 default encoding and with the {@code scanner/cdata-sections},
     * {@code scanner/ignore-specified-charset} and {@code override-doctype} features enabled.
     *
     * @param str the HTML markup to parse
     * @throws Exception if the parser rejects a setting or parsing fails
     */
    public Html(String str) throws Exception {
        InputSource source = new InputSource(IOUtils.toInputStream(str, "UTF-8"));

        DOMParser parser = new DOMParser();
        parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "UTF-8");
        parser.setFeature("http://cyberneko.org/html/features/scanner/cdata-sections", true);
        parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
        parser.setFeature("http://cyberneko.org/html/features/override-doctype", true);
        parser.parse(source);

        doc = parser.getDocument().getDocumentElement();
    }

    /**
     * Serialises the wrapped node with pretty-printing switched on.
     *
     * @return the result of {@code toString(true)}
     */
    @Override
    public String toString() {
        return toString(true);
    }

    /**
     * Serialises the wrapped node to markup.
     *
     * Text nodes yield their raw value; CDATA sections and comments are re-wrapped in their
     * delimiters. Every other node type goes through a DOM Level 3 {@code LSSerializer} with
     * the XML declaration suppressed, and the output is trimmed.
     *
     * @param z value passed to the serializer's {@code format-pretty-print} parameter
     * @return the serialised node, or {@code Object.toString()} if serialisation throws
     */
    public String toString(boolean z) {
        final short nodeType = doc.getNodeType();
        if (nodeType == Node.TEXT_NODE) {
            return ((Text) doc).getNodeValue();
        }
        if (nodeType == Node.CDATA_SECTION_NODE) {
            return "<![CDATA[" + ((CDATASection) doc).getNodeValue() + "]]>";
        }
        if (nodeType == Node.COMMENT_NODE) {
            return "<!--" + ((Comment) doc).getNodeValue() + "-->";
        }
        try {
            DOMImplementationLS lsImpl = (DOMImplementationLS) doc.getOwnerDocument().getImplementation();
            LSSerializer serializer = lsImpl.createLSSerializer();
            DOMConfiguration config = serializer.getDomConfig();
            config.setParameter("format-pretty-print", Boolean.valueOf(z));
            config.setParameter("xml-declaration", false);
            String markup = serializer.writeToString(doc);
            return markup.trim();
        } catch (Exception e) {
            // Best effort: any serialisation failure falls back to the default object description.
            return super.toString();
        }
    }

    /**
     * @return the DOM node name of the wrapped node
     */
    public String getName() {
        final String nodeName = doc.getNodeName();
        return nodeName;
    }

    /**
     * Looks up an attribute on the wrapped node.
     *
     * @param str the attribute name
     * @return the attribute value, or {@code null} when the wrapped node is not an element or
     *         does not carry the attribute
     */
    public String getAttribute(String str) {
        if (doc.getNodeType() != Node.ELEMENT_NODE) {
            return null;
        }
        Element element = (Element) doc;
        return element.hasAttribute(str) ? element.getAttribute(str) : null;
    }

    /**
     * Wraps every direct child of the wrapped node (elements, text, comments, ...) in a new
     * {@code Html} instance.
     *
     * @return the wrapped children in document order; empty when there are none
     */
    public List<Html> getChildren() {
        NodeList childNodes = this.doc.getChildNodes();
        // The list is not modified here, so its length can be read once.
        int count = childNodes.getLength();
        // Still a Vector, as before, but parameterised so the return is no longer an unchecked conversion.
        Vector<Html> children = new Vector<Html>(count);
        for (int i = 0; i < count; i++) {
            children.add(new Html(childNodes.item(i)));
        }
        return children;
    }

    /**
     * @return a wrapper around the parent element, or {@code null} when the wrapped node has
     *         no parent or its parent is not an element
     */
    public Html getParent() {
        Node parent = doc.getParentNode();
        boolean hasElementParent = parent != null && parent.getNodeType() == Node.ELEMENT_NODE;
        return hasElementParent ? new Html(parent) : null;
    }

    /**
     * Runs a CSS selector query rooted at the wrapped node.
     *
     * The result is produced by Guava's {@code Lists.transform} over the selector engine's
     * match list, with each match wrapped in an {@code Html}.
     *
     * @param str the CSS selector expression
     * @return the matches, each wrapped in an {@code Html}
     * @throws Exception if the selector engine fails
     */
    public List<Html> selectByCSS(String str) throws Exception {
        Function<Node, Html> wrap = new Function<Node, Html>() {
            @Override
            public Html apply(Node node) {
                return new Html(node);
            }
        };
        W3CNode root = new W3CNode(this.doc);
        return Lists.transform(new Selectors(root).querySelectorAll(str), wrap);
    }

    /**
     * Evaluates an XPath expression with the wrapped node as context and wraps each node of
     * the resulting node-set.
     *
     * The discarded {@code new W3CNode(this.doc)} allocation of the previous version was
     * dropped: its result was never used.
     *
     * @param str the XPath expression; it is evaluated as a node-set
     * @return the selected nodes in the order the evaluator returned them
     * @throws Exception if the expression cannot be compiled or evaluated
     */
    public List<Html> selectByXPath(String str) throws Exception {
        XPath xpath = XPathFactory.newInstance().newXPath();
        NodeList nodeList = (NodeList) xpath.evaluate(str, this.doc, XPathConstants.NODESET);
        int count = nodeList.getLength();
        Vector<Html> selected = new Vector<Html>(count);
        for (int i = 0; i < count; i++) {
            selected.add(new Html(nodeList.item(i)));
        }
        return selected;
    }

    /**
     * Judging by its name, this is meant to return an XPath expression for the wrapped node;
     * that is unverified, because the decompiler that produced this file could not recover the
     * method body.
     *
     * NOTE(review): as it stands the method always throws. Restore the real implementation
     * from the original class before relying on it.
     *
     * @throws UnsupportedOperationException always
     */
    public java.lang.String getXPath() {
        // Decompiler placeholder: the decompiler reported skipping a 336-instruction method dump here.
        throw new UnsupportedOperationException("Method not decompiled: com.mathworks.toolbox.textanalytics.Html.getXPath():java.lang.String");
    }

    /**
     * Extracts the text of the whole wrapped node.
     *
     * For an element, extraction starts at the first descendant named {@code BODY} when there
     * is one, otherwise at the element itself. Non-element nodes are processed directly.
     *
     * @return the extracted text as produced by {@code handleParagraphs}
     * @throws Exception propagated from {@code handleParagraphs}
     */
    public String extractAllText() throws Exception {
        if (doc.getNodeType() != Node.ELEMENT_NODE) {
            return handleParagraphs(doc);
        }
        Element root = (Element) doc;
        NodeList bodies = root.getElementsByTagName("BODY");
        Element start = root;
        if (bodies.getLength() >= 1) {
            start = (Element) bodies.item(0);
        }
        return handleParagraphs(start);
    }

    /**
     * Extracts the text of the subtree that holds the most "sentence-like" text.
     *
     * An XPath query picks every element (other than SCRIPT, STYLE, I, B, STRONG, SPAN and A)
     * that directly contains a text node longer than 20 characters after whitespace
     * normalisation. Each such element adds its rough sentence count to a running total kept
     * for its parent. The parent with the highest total is the one whose text is extracted.
     * If nothing matches, this falls back to {@link #extractAllText()}.
     *
     * A wrapper around a non-element node (text, comment, ...) is delegated to
     * {@link #extractAllText()} instead of failing on the element cast, in line with how
     * that method treats such nodes.
     *
     * @return the extracted text
     * @throws Exception if the XPath evaluation or the text extraction fails
     */
    public String extractTextEatiht() throws Exception {
        if (this.doc.getNodeType() != Node.ELEMENT_NODE) {
            return extractAllText();
        }
        Element root = (Element) this.doc;
        NodeList bodies = root.getElementsByTagName("BODY");
        Element scope = bodies.getLength() < 1 ? root : (Element) bodies.item(0);

        XPath xpath = XPathFactory.newInstance().newXPath();
        NodeList textHolders = (NodeList) xpath.evaluate(".//*[not(self::SCRIPT or self::STYLE or self::I or self::B or self::STRONG or self::SPAN or self::A)]/text()[string-length(normalize-space()) > 20]/..", scope, XPathConstants.NODESET);

        // Running sentence total per parent node.
        Map<Node, Integer> sentencesByParent = new HashMap<Node, Integer>();
        Node bestParent = null;
        int bestTotal = 0;
        for (int i = 0; i < textHolders.getLength(); i++) {
            Node holder = textHolders.item(i);
            Node parent = holder.getParentNode();
            int total = splitRoughlyIntoSentences(holder.getTextContent()).length;
            Integer previous = sentencesByParent.get(parent);
            if (previous != null) {
                total += previous.intValue();
            }
            // Strictly greater: on a tie the parent that reached the total first stays selected.
            if (total > bestTotal) {
                bestTotal = total;
                bestParent = parent;
            }
            sentencesByParent.put(parent, Integer.valueOf(total));
        }
        return bestParent == null ? extractAllText() : handleParagraphs(bestParent);
    }

    /**
     * Extracts the main text by scoring the document tree.
     *
     * Works on a deep clone of the node chosen by {@code guessMainBody}: junk elements and
     * unlikely candidates are removed, DIVs are tidied, paragraphs are scored, and the text of
     * the best-scoring candidate is returned. A non-element main body is extracted directly.
     *
     * @return the extracted text
     * @throws Exception propagated from {@code handleParagraphs}
     */
    public String extractTextTreeAnalysis() throws Exception {
        Node mainBody = guessMainBody();
        if (mainBody.getNodeType() != Node.ELEMENT_NODE) {
            return handleParagraphs(mainBody);
        }
        // All pruning below happens on a copy of the subtree.
        Element working = (Element) mainBody.cloneNode(true);
        working = discardJunkElements(working);
        working = discardUnlikelyCandidates(working);
        working = cleanUpDivs(working);
        HashMap<Element, Integer> scores = appraiseParagraphs(working);
        Element winner = findBestCandidate(scores, working);
        return handleParagraphs(winner);
    }

    /**
     * Detaches every descendant of {@code element} that has the given tag name.
     *
     * {@code getElementsByTagName} returns a live list that shrinks as nodes are removed, so
     * the list is walked from the end. Walking it forward while removing skips the element
     * that slides into the slot just vacated. Going backwards, a removal only affects indices
     * that have already been visited.
     *
     * @param element root of the subtree to prune; the root itself is never removed
     * @param str     tag name to remove, as understood by {@code getElementsByTagName}
     * @return {@code element}, for call chaining
     */
    private static Element removeElements(Element element, String str) {
        NodeList matches = element.getElementsByTagName(str);
        for (int i = matches.getLength() - 1; i >= 0; i--) {
            Node match = matches.item(i);
            Node parent = match.getParentNode();
            if (parent != null) {
                parent.removeChild(match);
            }
        }
        return element;
    }

    /** Sentence terminators used for the rough split: '.', '?', '!' and the ideographic full stop. */
    private static final Pattern SENTENCE_BREAK = Pattern.compile("[.?!。]");

    /**
     * Splits text at every sentence-terminating character. This is only a rough count:
     * abbreviations and decimal points split as well.
     *
     * The pattern is compiled once instead of on every call; the result is the same as
     * {@code str.split("[.?!。]")}, including the dropping of trailing empty pieces.
     *
     * @param str the text to split; must not be {@code null}
     * @return the pieces between terminators
     */
    private static String[] splitRoughlyIntoSentences(String str) {
        return SENTENCE_BREAK.split(str);
    }

    /**
     * Renders the text of a subtree into a fresh buffer, starting with no paragraph break
     * pending.
     *
     * @param node root of the subtree to render
     * @return the accumulated text
     * @throws Exception propagated from the recursive overload
     */
    private static String handleParagraphs(Node node) throws Exception {
        StringBuilder text = new StringBuilder();
        handleParagraphs(node, text, false);
        return text.toString();
    }

    /**
     * Appends the text of {@code node} and its descendants to {@code sb}, inserting line
     * breaks around block-level elements.
     *
     * This rebuilds the string switch that the decompiler had emitted as a hash-code switch
     * feeding a second switch with a {@code boolean} selector and duplicate {@code case true}
     * labels, which is not valid Java. In that form the default element branch also fell
     * through into the text-node branch. That would have appended the element's whole text
     * content a second time and would fail for document nodes, whose text content is
     * {@code null}. Here the default branch returns the carried flag instead.
     *
     * Handling by node:
     * - SCRIPT, STYLE, RP, RT: skipped entirely.
     * - BR: emits a blank line if a paragraph break is pending, otherwise one newline.
     * - H1-H6, LI, P, DIV: start on a new paragraph (unless the buffer is empty), render the
     *   children, and leave a paragraph break pending.
     * - PRE: like the block elements, but the text content is appended verbatim.
     * - any other element, or a document node: children are rendered in place.
     * - text and CDATA: whitespace runs collapse to one space; a pending break is emitted first.
     * - every other node type is ignored.
     *
     * @param node the node to render
     * @param sb   the buffer receiving the text
     * @param z    whether a paragraph break is pending before the next piece of text
     * @return whether a paragraph break is pending after this node
     * @throws Exception declared for compatibility with existing callers
     */
    private static boolean handleParagraphs(Node node, StringBuilder sb, boolean z) throws Exception {
        short nodeType = node.getNodeType();

        if (nodeType == Node.TEXT_NODE || nodeType == Node.CDATA_SECTION_NODE) {
            if (z) {
                sb.append("\n\n");
            }
            sb.append(node.getTextContent().replaceAll(WS_Regex, " "));
            return false;
        }
        if (nodeType != Node.ELEMENT_NODE && nodeType != Node.DOCUMENT_NODE) {
            return z;
        }

        switch (node.getNodeName()) {
            case "SCRIPT":
            case "STYLE":
            case "RP":
            case "RT":
                // Contributes no text; the pending-break state passes through untouched.
                return z;

            case "BR":
                sb.append(z ? "\n\n" : "\n");
                return false;

            case "H1":
            case "H2":
            case "H3":
            case "H4":
            case "H5":
            case "H6":
            case "LI":
            case "P":
            case "DIV": {
                if (sb.length() > 0) {
                    sb.append("\n\n");
                }
                // The block has just emitted its own break, so its children start with none pending.
                boolean pending = false;
                for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
                    pending = handleParagraphs(child, sb, pending);
                }
                return true;
            }

            case "PRE":
                if (sb.length() > 0) {
                    sb.append("\n\n");
                }
                // Preformatted text keeps its whitespace.
                sb.append(node.getTextContent());
                return true;

            default:
                for (Node child = node.getFirstChild(); child != null; child = child.getNextSibling()) {
                    z = handleParagraphs(child, sb, z);
                }
                return z;
        }
    }

    /**
     * Measures the text content of a node after collapsing whitespace runs to single spaces
     * and trimming both ends.
     *
     * @param node the node whose text content is measured
     * @return the number of characters left after normalisation
     */
    private static int strLength(Node node) {
        String collapsed = node.getTextContent().replaceAll(WS_Regex, " ");
        String trimmed = collapsed.trim();
        return trimmed.length();
    }

    /**
     * Computes the ratio of link text to all text within an element, using normalised lengths
     * from {@code strLength}.
     *
     * The anchor list is fetched once. The decompiled form repeated the lookup in the loop
     * condition and read items through an undefined identifier ({@code r0}).
     *
     * The division is deliberately left unguarded: an element without any text yields
     * {@code 0.0 / 0.0 = NaN}, and the comparisons in {@code findBestCandidate} are then both
     * false, so such an element is never selected.
     *
     * @param element the element to measure
     * @return summed length of the text inside descendant {@code A} elements divided by the
     *         element's total text length; {@code NaN} when the element has no text
     */
    private static double linkDensity(Element element) {
        double totalLength = strLength(element);
        NodeList anchors = element.getElementsByTagName("A");
        double linkLength = 0.0d;
        for (int i = 0, count = anchors.getLength(); i < count; i++) {
            linkLength += strLength(anchors.item(i));
        }
        return linkLength / totalLength;
    }

    /**
     * Decides whether an element looks like page furniture rather than content.
     *
     * The element is described as {@code NAME/id}. It is unlikely when that description
     * matches the "unlikely" pattern and does not also match the "ok, maybe" pattern. Both
     * patterns are case-insensitive and are compiled on first use.
     *
     * @param element the element to classify
     * @return {@code true} if the element should be discarded as an unlikely candidate
     */
    private static boolean unlikelyCandidate(Element element) {
        if (unlikelyCandidateRe == null) {
            unlikelyCandidateRe = Pattern.compile("a/expand|a/collapse|comment|dialog|disqus|foot|header|menu|meta|nav|rss|shoutbox|sidebar|sponsor", Pattern.CASE_INSENSITIVE);
        }
        if (okMaybeCandidateRe == null) {
            okMaybeCandidateRe = Pattern.compile("article|body|content|^entry|^page|pagination|post|text", Pattern.CASE_INSENSITIVE);
        }

        String id = element.getAttribute("ID");
        String idSuffix = id != null ? "/" + id : "";
        String descriptor = element.getNodeName() + idSuffix;

        if (!unlikelyCandidateRe.matcher(descriptor).find()) {
            return false;
        }
        return !okMaybeCandidateRe.matcher(descriptor).find();
    }

    /**
     * Computes the starting score of a candidate container from its tag name and ID.
     *
     * This rebuilds the string switch that the decompiler had emitted as a hash-code switch
     * feeding a second switch with a {@code boolean} selector and duplicate {@code case true}
     * labels, which is not valid Java. The arithmetic of the recovered code is kept exactly,
     * including its fall-through between tiers. The net tag weights are therefore:
     * SECTION 12, DIV 5, PRE/TD/BLOCKQUOTE 0, ADDRESS/OL/UL/DL/DD/DT/LI/FORM -3, anything
     * else 0.
     *
     * NOTE(review): the tiers cascade only because the recovered code has no {@code break}
     * between them. Confirm against the original class whether independent weights
     * (+7 / +5 / +3 / -3) were intended before changing this.
     *
     * On top of the tag weight, the tag name and the ID attribute are each tested against the
     * positive and negative patterns (case-insensitive, compiled on first use); each hit adds
     * or subtracts 25.
     *
     * @param element the element to score
     * @return the initial score
     */
    private int classScore(Element element) {
        String tagName = element.getNodeName();
        int score = 0;
        switch (tagName) {
            case "SECTION":
                score += 7;
                // fall through
            case "DIV":
                score += 5;
                // fall through
            case "PRE":
            case "TD":
            case "BLOCKQUOTE":
                score += 3;
                // fall through
            case "ADDRESS":
            case "OL":
            case "UL":
            case "DL":
            case "DD":
            case "DT":
            case "LI":
            case "FORM":
                score -= 3;
                break;
            default:
                break;
        }

        if (positiveScoreRe == null) {
            positiveScoreRe = Pattern.compile("article|body|content|entry|hentry|main|page|pagination|post|text|blog|story", Pattern.CASE_INSENSITIVE);
        }
        if (negativeScoreRe == null) {
            negativeScoreRe = Pattern.compile("combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget", Pattern.CASE_INSENSITIVE);
        }

        if (positiveScoreRe.matcher(tagName).find()) {
            score += 25;
        }
        if (negativeScoreRe.matcher(tagName).find()) {
            score -= 25;
        }

        String id = element.getAttribute("ID");
        if (id != null) {
            if (positiveScoreRe.matcher(id).find()) {
                score += 25;
            }
            if (negativeScoreRe.matcher(id).find()) {
                score -= 25;
            }
        }
        return score;
    }

    /**
     * Scores the containers of every {@code P} element below {@code element}.
     *
     * A paragraph is worth one point, plus one per comma in its text, plus one per full 100
     * characters of normalised text up to a cap of three. The parent receives the whole
     * amount and the grandparent, when present, half of it (integer division). A container
     * seen for the first time starts from its {@code classScore}.
     *
     * @param element root of the subtree whose paragraphs are scored
     * @return the accumulated score per container element
     */
    private HashMap<Element, Integer> appraiseParagraphs(Element element) {
        HashMap<Element, Integer> scores = new HashMap<>();
        NodeList paragraphs = element.getElementsByTagName("P");
        for (int index = 0; index < paragraphs.getLength(); index++) {
            Element paragraph = (Element) paragraphs.item(index);
            Element parent = (Element) paragraph.getParentNode();
            if (parent == null) {
                continue;
            }
            Element grandparent = (Element) parent.getParentNode();
            String text = paragraph.getTextContent();

            // Seed containers that have not been scored yet.
            if (!scores.containsKey(parent)) {
                scores.put(parent, Integer.valueOf(classScore(parent)));
            }
            if (grandparent != null && !scores.containsKey(grandparent)) {
                scores.put(grandparent, Integer.valueOf(classScore(grandparent)));
            }

            int commas = StringUtils.countMatches(text, ",");
            double lengthBonus = Math.min(Math.floor(strLength(paragraph) / 100), 3.0d);
            int points = (int) (1 + commas + lengthBonus);

            int parentTotal = scores.get(parent).intValue() + points;
            scores.put(parent, Integer.valueOf(parentTotal));
            if (grandparent != null) {
                int grandparentTotal = scores.get(grandparent).intValue() + (points / 2);
                scores.put(grandparent, Integer.valueOf(grandparentTotal));
            }
        }
        return scores;
    }

    /**
     * Narrows the wrapped node down to the most specific content container.
     *
     * Starts from the first {@code BODY} descendant (or the wrapped element itself when there
     * is none). It then descends into {@code MAIN}, and after that into {@code ARTICLE}, but
     * only when exactly one such descendant exists. A non-element wrapped node is returned
     * unchanged.
     *
     * @return the node in which to look for the main text
     */
    private Node guessMainBody() {
        if (doc.getNodeType() != Node.ELEMENT_NODE) {
            return doc;
        }
        Element root = (Element) doc;
        Element scope = root;

        NodeList bodies = root.getElementsByTagName("BODY");
        if (bodies.getLength() >= 1) {
            scope = (Element) bodies.item(0);
        }

        NodeList mains = scope.getElementsByTagName("MAIN");
        if (mains.getLength() == 1) {
            scope = (Element) mains.item(0);
        }

        NodeList articles = scope.getElementsByTagName("ARTICLE");
        if (articles.getLength() == 1) {
            scope = (Element) articles.item(0);
        }
        return scope;
    }

    /**
     * Strips elements that are not treated as readable text (embedded media, forms,
     * navigation, scripts, styling, ...) by calling {@code removeElements} once per tag name,
     * in the order listed.
     *
     * @param element root of the subtree to prune
     * @return the element returned by the last {@code removeElements} call
     */
    private Element discardJunkElements(Element element) {
        String[] junkTags = {
            "APPLET", "AUDIO", "EMBED", "FOOTER", "FORM", "IFRAME", "IMG", "MAP", "MENU",
            "META", "NAV", "OBJECT", "PICTURE", "RT", "SCRIPT", "STYLE", "SVG"
        };
        Element pruned = element;
        for (String tag : junkTags) {
            pruned = removeElements(pruned, tag);
        }
        return pruned;
    }

    /**
     * Removes every descendant that {@code unlikelyCandidate} classifies as page furniture.
     *
     * {@code getElementsByTagName("*")} returns a live list: detaching an element drops it and
     * all of its descendants from the list. Walking forward while removing therefore skips
     * the element that slides into the vacated slot. The list is walked from the end instead.
     * Descendants always sit at higher indices than their ancestors, so a removal only
     * affects positions that were already visited.
     *
     * @param element root of the subtree to prune; the root itself is never removed
     * @return {@code element}, for call chaining
     */
    private Element discardUnlikelyCandidates(Element element) {
        NodeList descendants = element.getElementsByTagName("*");
        for (int i = descendants.getLength() - 1; i >= 0; i--) {
            Element candidate = (Element) descendants.item(i);
            if (!unlikelyCandidate(candidate)) {
                continue;
            }
            Node parent = candidate.getParentNode();
            if (parent != null) {
                parent.removeChild(candidate);
            }
        }
        return element;
    }

    /**
     * Simplifies {@code DIV} elements below {@code element}.
     *
     * - A DIV with exactly one child node is replaced by that child.
     * - A DIV whose children are all inline elements (per {@code inlineElements}), or which
     *   has no children, is renamed to {@code P}.
     *
     * Both operations take the DIV out of the live list returned by
     * {@code getElementsByTagName}, so indexing that list while modifying the tree skips the
     * DIV that follows each change. The DIVs are copied into an array first and processed in
     * document order from that snapshot.
     *
     * @param element root of the subtree to tidy; the root itself is not examined
     * @return {@code element}, for call chaining
     */
    private Element cleanUpDivs(Element element) {
        NodeList liveDivs = element.getElementsByTagName("DIV");
        Element[] divs = new Element[liveDivs.getLength()];
        for (int i = 0; i < divs.length; i++) {
            divs[i] = (Element) liveDivs.item(i);
        }

        for (Element div : divs) {
            if (div.getChildNodes().getLength() == 1) {
                Node parent = div.getParentNode();
                if (parent != null) {
                    // replaceChild moves the only child into the DIV's position.
                    parent.replaceChild(div.getFirstChild(), div);
                }
                continue;
            }

            boolean onlyInline = true;
            for (Node child = div.getFirstChild(); onlyInline && child != null; child = child.getNextSibling()) {
                // Text nodes are named "#text" and therefore do not count as inline here.
                onlyInline = inlineElements.contains(child.getNodeName());
            }
            if (onlyInline) {
                div.getOwnerDocument().renameNode(div, null, "P");
            }
        }
        return element;
    }

    /**
     * Picks the container with the highest link-adjusted score.
     *
     * Each raw score is multiplied by {@code 1 - linkDensity}. Ties are broken in turn by
     * longer raw text content, then by longer rendered text or lexicographically greater
     * rendered text. The search starts from {@code element} with a score of zero, so a
     * candidate has to score above zero (or win a tie at zero) to replace it.
     *
     * @param hashMap raw scores per candidate element
     * @param element the fallback result
     * @return the winning element
     */
    private Element findBestCandidate(HashMap<Element, Integer> hashMap, Element element) {
        Element best = element;
        double bestScore = 0.0d;
        for (Map.Entry<Element, Integer> entry : hashMap.entrySet()) {
            Element candidate = entry.getKey();
            double score = entry.getValue().intValue() * (1.0d - linkDensity(candidate));

            if (score > bestScore) {
                best = candidate;
                bestScore = score;
                continue;
            }
            if (score != bestScore) {
                // Lower score, or NaN from an element without text: not a contender.
                continue;
            }

            // Exact tie on score: fall back to text-based tie-breakers.
            int bestLength = best.getTextContent().length();
            int candidateLength = candidate.getTextContent().length();
            if (candidateLength > bestLength) {
                best = candidate;
            } else if (candidateLength == bestLength) {
                try {
                    String bestText = handleParagraphs(best);
                    String candidateText = handleParagraphs(candidate);
                    if (candidateText.length() > bestText.length() || candidateText.compareTo(bestText) > 0) {
                        best = candidate;
                    }
                } catch (Exception ignored) {
                    // Best effort: if rendering fails the current choice simply stays.
                }
            }
        }
        return best;
    }
}
