zhuyujia-webhopper/Tools/domain_extract.py


import tldextract


class Extracter:
    """Classify resource URLs as first- or third-party for a given website.

    The website passed to the constructor is reduced to its host and
    registered domain once; ``isThirdParty`` then compares the registered
    domain of each resource URL against it.

    Attributes:
        website: The website URL exactly as passed to the constructor.
        host: Host of the website (subdomain + domain + suffix).
        domain: Registered domain of the website (domain + suffix).
    """

    # NOTE: the parameter name ``webiste`` is misspelled but is kept as-is so
    # that callers passing it by keyword keep working.
    def __init__(self, webiste):
        self.website = webiste
        self.host, self.domain = self.extract(self.website)

    @staticmethod
    def _join_labels(*labels):
        """Join the non-empty labels with dots, skipping empty ones."""
        return ".".join(label for label in labels if label)

    @staticmethod
    def extract(url):
        """Split ``url`` into its host and its registered domain.

        Args:
            url: A URL or bare host name understood by ``tldextract``.

        Returns:
            A ``(host, domain)`` tuple of strings. ``host`` is
            ``subdomain.domain.suffix`` and ``domain`` is ``domain.suffix``;
            parts that ``tldextract`` reports as empty are omitted, so a
            suffix-less input such as ``localhost`` yields ``"localhost"``
            rather than ``"localhost."``.
        """
        # Read the parts by attribute name instead of tuple-unpacking the
        # result: the named attributes do not depend on how many fields the
        # result object has or on it being iterable.
        parts = tldextract.extract(url)
        host = Extracter._join_labels(
            parts.subdomain, parts.domain, parts.suffix
        )
        domain = Extracter._join_labels(parts.domain, parts.suffix)
        return host, domain

    def isThirdParty(self, url):
        """Report whether ``url`` belongs to a different registered domain.

        Args:
            url: The resource URL to classify.

        Returns:
            ``(None, None)`` when ``url`` is empty or ``None``; otherwise a
            ``(host, flag)`` tuple where ``host`` is the resource host and
            ``flag`` is ``1`` if its registered domain differs from the
            website's, else ``0``.
        """
        if not url:
            return None, None
        host, domain = self.extract(url)
        # Callers receive an integer flag (0/1), not a bool.
        return host, int(domain != self.domain)