package me.angrybyte.goose.cleaners;

import com.bria.common.controller.remotedebug.RemoteDebugConstants;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Locale;
import java.util.regex.Pattern;
import me.angrybyte.goose.texthelpers.ReplaceSequence;
import me.angrybyte.goose.texthelpers.string;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.select.Elements;
import org.simpleframework.xml.strategy.Name;

/* loaded from: classes4.dex */
/**
 * {@link DocumentCleaner} that strips markup from a parsed document in a fixed sequence of passes:
 * unwrapping {@code <em>} and drop-cap spans, removing scripts/styles, removing elements whose
 * {@code id}/{@code class}/{@code name} match a list of known patterns, and finally rewriting
 * {@code <div>} and {@code <span>} elements into paragraphs.
 *
 * <p>Every pass mutates the supplied {@link Document} in place and returns that same instance.
 */
public class DefaultDocumentCleaner implements DocumentCleaner {
    /** Matches the opening of a tag that makes a div/span a "container" rather than plain text. */
    private static final Pattern divToPElementsPattern = Pattern.compile("<(a|blockquote|dl|div|img|ol|p|pre|table|ul)");
    private static final Pattern captionPattern = Pattern.compile("^caption$");
    private static final Pattern googlePattern = Pattern.compile(" google ");
    // NOTE(review): "[^entry-]" is a character class (one char that is none of e,n,t,r,y,-),
    // not a negative look-behind for "entry-". Left untouched to preserve existing matching.
    private static final Pattern entriesPattern = Pattern.compile("^[^entry-]more.*$");
    private static final Pattern facebookPattern = Pattern.compile("[^-]facebook");
    private static final Pattern twitterPattern = Pattern.compile("[^-]twitter");

    /** Alternation of id/class/name fragments identifying elements to remove. */
    private static final String regExRemoveNodes = "^side$|combx|retweet|menucontainer|navbar|comment|PopularQuestions|contact|foot|footer|Footer|footnote|cnn_strycaptiontxt|links|meta$|scroll|shoutbox|sponsor|tags|socialnetworking|socialNetworking|cnnStryHghLght|cnn_stryspcvbx|^inset$|pagetools|post-attributes|welcome_form|contentTools2|the_answers|communitypromo|subscribe|vcard|articleheadings|date|print|popup|author-dropdown|tools|socialtools|byline|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text";

    // The three selector queries share one regex; they are compile-time constants built from it
    // so the list only has to be maintained in a single place.
    private static final String queryNaughtyIDs = "[id~=(" + regExRemoveNodes + ")]";
    private static final String queryNaughtyClasses = "[class~=(" + regExRemoveNodes + ")]";
    private static final String queryNaughtyNames = "[name~=(" + regExRemoveNodes + ")]";

    // NOTE(review): RemoteDebugConstants.NEW_LINE comes from an unrelated package; presumably it is
    // the string "\n" substituted by the decompiler - confirm before replacing it with a literal.
    private static final ReplaceSequence tabsAndNewLinesReplcesments = ReplaceSequence.create(RemoteDebugConstants.NEW_LINE, "\n\n").append("\t").append("^\\s+$");

    /**
     * Removes elements under the body's direct children whose {@code id}, {@code class} or
     * {@code name} attribute matches {@link #regExRemoveNodes}.
     *
     * @param document document to clean in place
     * @return the same {@code document}; returned untouched when it has no body
     */
    private Document cleanBadTags(Document document) {
        Element body = document.body();
        if (body == null) {
            // Nothing to select from; avoid a NullPointerException on body-less documents.
            return document;
        }
        Elements children = body.children();
        for (Element naughty : children.select(queryNaughtyIDs)) {
            removeNode(naughty);
        }
        for (Element naughty : children.select(queryNaughtyClasses)) {
            removeNode(naughty);
        }
        for (Element naughty : children.select(queryNaughtyNames)) {
            removeNode(naughty);
        }
        return document;
    }

    /**
     * Replaces every {@code <em>} element that contains no {@code <img>} with a text node holding
     * the element's text.
     *
     * @param document document to clean in place
     * @return the same {@code document}
     */
    private Document cleanEmTags(Document document) {
        for (Element em : document.getElementsByTag("em")) {
            if (em.getElementsByTag("img").size() == 0) {
                em.replaceWith(new TextNode(em.text(), document.baseUri()));
            }
        }
        return document;
    }

    /**
     * Rewrites every element with the given tag name. Elements whose inner HTML contains one of
     * the tags in {@link #divToPElementsPattern} get their loose text gathered into a new leading
     * {@code <p>}; all other elements are replaced outright by a {@code <p>} with the same HTML.
     *
     * @param document document to rewrite in place
     * @param str tag name of the elements to convert (e.g. {@code "div"})
     * @return the same {@code document}
     */
    private Document convertDivsToParagraphs(Document document, String str) {
        for (Element element : document.getElementsByTag(str)) {
            try {
                // Locale.ROOT keeps the lower-casing independent of the device locale.
                String html = element.html().toLowerCase(Locale.ROOT);
                if (divToPElementsPattern.matcher(html).find()) {
                    moveLooseTextIntoParagraph(document, element);
                } else {
                    replaceWithParagraph(document, element);
                }
            } catch (Exception ignored) {
                // Best-effort: a failure while rewriting one element is skipped so that the
                // remaining elements are still processed.
            }
        }
        return document;
    }

    /**
     * Collects the non-trivial direct text children of {@code element} into a new {@code <p>}
     * inserted before the element's first child, then removes the collected text nodes.
     */
    private void moveLooseTextIntoParagraph(Document document, Element element) {
        StringBuilder paragraphHtml = new StringBuilder();
        ArrayList<Node> consumed = new ArrayList<Node>();
        for (Node node : element.childNodes()) {
            if (!"#text".equals(node.nodeName())) {
                continue;
            }
            // NOTE(review): the text is read through the "text" attribute; whether that yields
            // the node's text depends on the jsoup version in use - confirm.
            String rawText = ((TextNode) node).attr("text");
            if (string.isNullOrEmpty(rawText)) {
                continue;
            }
            String cleaned = tabsAndNewLinesReplcesments.replaceAll(rawText);
            if (cleaned.length() <= 1) {
                continue;
            }
            // Keep a link that directly precedes the text together with it in the paragraph.
            Node previous = node.previousSibling();
            if (previous != null && "a".equals(previous.nodeName())) {
                paragraphHtml.append(previous.outerHtml());
            }
            paragraphHtml.append(cleaned);
            consumed.add(node);
        }

        Element paragraph = new Document(document.baseUri()).createElement("p");
        paragraph.html(paragraphHtml.toString());
        element.childNode(0).before(paragraph.outerHtml());
        // Removal is deferred until after iteration so the child list is not modified mid-loop.
        for (Node node : consumed) {
            node.remove();
        }
    }

    /** Replaces {@code element} with a new {@code <p>} carrying the element's inner HTML. */
    private void replaceWithParagraph(Document document, Element element) {
        Element paragraph = new Document(document.baseUri()).createElement("p");
        paragraph.append(element.html());
        element.replaceWith(paragraph);
    }

    /**
     * Replaces every {@code <span>} whose class matches {@code dropcap} or {@code drop_cap} with a
     * text node holding the span's text.
     *
     * @param document document to clean in place
     * @return the same {@code document}
     */
    private Document removeDropCaps(Document document) {
        for (Element dropCap : document.select("span[class~=(dropcap|drop_cap)]")) {
            dropCap.replaceWith(new TextNode(dropCap.text(), document.baseUri()));
        }
        return document;
    }

    /** Detaches {@code element} from its parent; does nothing for {@code null} or parentless elements. */
    private void removeNode(Element element) {
        if (element != null && element.parent() != null) {
            element.remove();
        }
    }

    /**
     * Removes every element whose {@code id} attribute, or the attribute named by
     * {@code Name.LABEL}, matches {@code pattern}.
     *
     * @param document document to clean in place
     * @param pattern regex applied to the attribute values
     * @return the same {@code document}
     */
    private Document removeNodesViaRegEx(Document document, Pattern pattern) {
        try {
            for (Element match : document.getElementsByAttributeValueMatching("id", pattern)) {
                removeNode(match);
            }
            // NOTE(review): Name.LABEL belongs to an unrelated XML library; presumably it is the
            // string "class" substituted by the decompiler - confirm and replace with the literal.
            for (Element match : document.getElementsByAttributeValueMatching(Name.LABEL, pattern)) {
                removeNode(match);
            }
        } catch (IllegalArgumentException e) {
            e.printStackTrace();
        }
        return document;
    }

    /**
     * Removes all {@code <script>} and {@code <style>} elements.
     *
     * @param document document to clean in place
     * @return the same {@code document}
     */
    private Document removeScriptsAndStyles(Document document) {
        for (Element script : document.getElementsByTag("script")) {
            script.remove();
        }
        for (Element style : document.getElementsByTag("style")) {
            style.remove();
        }
        return document;
    }

    /**
     * Runs all cleaning passes over {@code document}, in order: em tags, drop caps,
     * scripts/styles, known bad id/class/name values, the caption/google/entries/facebook/twitter
     * patterns, then div and span conversion.
     *
     * @param document document to clean in place
     * @return the document returned by the final pass (each pass returns the instance it was given)
     */
    @Override // me.angrybyte.goose.cleaners.DocumentCleaner
    public Document clean(Document document) {
        Document doc = cleanEmTags(document);
        doc = removeDropCaps(doc);
        doc = removeScriptsAndStyles(doc);
        doc = cleanBadTags(doc);
        doc = removeNodesViaRegEx(doc, captionPattern);
        doc = removeNodesViaRegEx(doc, googlePattern);
        doc = removeNodesViaRegEx(doc, entriesPattern);
        doc = removeNodesViaRegEx(doc, facebookPattern);
        doc = removeNodesViaRegEx(doc, twitterPattern);
        doc = convertDivsToParagraphs(doc, "div");
        return convertDivsToParagraphs(doc, "span");
    }
}
