Source: python中国网 | Date: 2020-02-12

  How an SEO works out which sites in an industry are doing well / getting the most traffic:

  1. Gather a batch of industry keywords and scrape the top-10 Baidu result URLs for each word. With 10,000 keywords that gives 100,000 URLs.

  2. Extract the domain from each of those 100,000 URLs and count how many times each domain appears.

  3. Divide a domain's occurrence count by 100,000 to get its first-page coverage rate (a short sketch of steps 2 and 3 follows after this list).

  4. The domains with high coverage are the high-traffic sites.

  (PS: keep these statistics over time and you can spot which sites have been penalized and which are surging.)
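
  As a rough illustration of steps 2 and 3, here is a minimal sketch; serp_urls is a hypothetical list standing in for the ranking URLs collected in step 1.

from collections import Counter
from urllib.parse import urlparse

# Hypothetical input: the ranking URLs gathered in step 1 (normally 10 per keyword)
serp_urls = [
    'https://m.lianjia.com/bj/xiaoqu/1111.html',
    'https://m.anjuke.com/bj/community/2222/',
    'https://m.lianjia.com/sh/xiaoqu/3333.html',
]

# Step 2: count how often each domain appears; step 3: divide by the total URL count
domain_counts = Counter(urlparse(u).netloc for u in serp_urls)
total = len(serp_urls)
for domain, num in domain_counts.most_common():
    print(domain, '{:.2%}'.format(num / total))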

  Notes on Baidu anti-crawling

  Baidu blocks user agents and cookies; outright IP bans are less common.

  The nastiest part is that the page the crawler gets back can differ from what a real search shows.

  Because we request a directly concatenated search URL, there are no real mouse actions to trigger the JS requests, so the crawler is easy to identify. Crawl at volume without cookies and you will find the responses look normal, yet their content no longer matches a real search.
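
  The script below guards against this by sanity-checking the page it got back. A minimal sketch of that kind of check (assuming, as the script does, that a genuine mobile SERP keeps the https://m.baidu.com/s?ie=utf-8 URL after redirects and carries a "- 百度" title):

import requests

def looks_like_real_serp(word, headers):
    # Fetch the mobile SERP by concatenating the search URL directly
    r = requests.get('https://m.baidu.com/s?ie=utf-8&word=' + word, headers=headers, timeout=5)
    html = r.content.decode('utf-8', errors='ignore')
    # Anti-crawl pages tend to redirect away and change the title
    return 'https://m.baidu.com/s?ie=utf-8' in r.url and '- 百度' in html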

  What the script does (domain coverage rate plus first-page keyword counts for the target domains, in one script):

  1) For the specified domains, monitor the number of first-page keywords, broken down by keyword category.

  2) Collect every URL on the SERP, extract the domains and compute each domain's first-page coverage rate.

  Script rules:

  1) Related sites, related companies, smart mini-programs, "others also searched", hot-topic aggregations, news aggregations, smart search aggregations and video blocks are all counted,

  so a first-page rank can come out greater than 10.

  2) For organic results, the mu attribute on the SERP holds the ranking URL; for special result styles mu is empty or missing,

  in which case the URL is taken from the article element. That URL is on a baidu domain and would need a second request to resolve the real URL; this script just keeps the baidu link (see the short sketch after rule 3).

  3) 2020xiaoqu_kwd_city.xlsx: each sheet name is a keyword category; the keywords go in the first column of each sheet.
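
  The idea behind rule 2 in code, as a sketch with a hypothetical result div (the full version lives in get_real_urls below):

import json
from pyquery import PyQuery as pq

# Hypothetical .c-result block: mu is empty, so the baidu link in article is used instead
div = pq('<div class="c-result" data-log=\'{"mu": "", "order": "3"}\'>'
         '<div class="c-result-content"><article rl-link-href="https://m.baidu.com/sf?pd=xxx"></article></div></div>')
data_log = json.loads(div.attr('data-log').replace("'", '"'))  # real pages use single quotes in data-log
rank_url = data_log['mu'] or div('.c-result-content article').attr('rl-link-href')
print(data_log['order'], rank_url)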

  Script output:

  bdmo1_index_info.txt: each monitored site's rank and URL for every keyword (tab-separated: keyword, URL, rank, category, domain); if a site has two ranking URLs, only the first is kept.

  bdmo1_index_all.txt: every SERP URL plus its style feature (keyword, URL, rank, srcid, category); first-page coverage per domain is computed from this file by a separate script (bdmo1_tj.py).

  bdmo1_index.xlsx: first-page keyword counts per category for your own site.

  bdmo1_index_domains.xlsx: first-page keyword counts per category for every monitored site.

  bdmo1_index_domains.txt: the same data as the xlsx, in text form.

  The cookies must come from a logged-in Baidu account, otherwise the anti-crawler kicks in very quickly.

# -*- coding: utf-8 -*-
"""
功能:
   1)指定域名,分关键词种类监控首页词数
   2)采集serp所有url,提取域名并统计各域名首页覆盖率
提示:
  1)相关网站.相关企业.智能小程序.其他人还在搜.热议聚合.资讯聚合.搜索智能聚合.视频全部算在内
    所以首页排名有可能大于10
  2)serp上自然排名mu属性值为排名url,特殊样式mu为空或不存在,
    提取article里url,该url是baidu域名,二次访问才能获得真实url,本脚本直接取baidu链接
  3)2020xiaoqu_kwd_city.xlsx:sheet名为关键词种类,sheet第一列放关键词
结果:
    bdmo1_index_info.txt:各监控站点词的排名及url,如有2个url排名,只取第一个
    bdmo1_index_all.txt:serp所有url及样式特征,依此统计各域名首页覆盖率-单写脚本完成
    bdmo1_index.xlsx:自己站每类词首页词数
    bdmo1_index_domains.xlsx:各监控站点每类词的首页词数
    bdmo1_index_domains.txt:各监控站点每类词的首页词数
cookie必须是登录baidu账号后的cookie否则很容易被反爬

"""

import requests
from pyquery import PyQuery as pq
import threading
import queue
import time
from urllib.parse import urlparse
from openpyxl import load_workbook
from openpyxl import Workbook
import gc
import json
import random


# Compute the final tallies from bdmo1_index_info.txt
def get_result(file_path, result):
    for line in open(file_path, 'r', encoding='utf-8'):
        # columns: keyword, url, rank, category, domain
        line = line.strip().split('\t')
        rank = line[2]
        group = line[3]
        domain = line[4]
        if rank != '无':
            result[domain][group]['首页'] += 1
        result[domain][group]['总词数'] += 1
    return result


# Write txt: results for every monitored domain
def write_domains_txt(result_last):
    with open('{0}bdmo1_index_domains.txt'.format(today), 'w', encoding="utf-8") as f_res:
        f_res.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format('日期', '域名', '词类', '首页词数', '查询词数'))
        for now_domain, dict_value in result_last.items():
            for group, dict_index_all in dict_value.items():
                f_res.write('{0}\t{1}\t{2}\t'.format(today, now_domain, group))
                for key, value in dict_index_all.items():
                    f_res.write(str(value) + '\t')
                f_res.write('\n')


# Write the excel files
def write_myexcel(group_list, result_last, today, my_domain):
    wb = Workbook()
    wb_all = Workbook()
    # Create one sheet per keyword category and write the header rows
    sheet_num = 0
    for group in group_list:
        wb.create_sheet(u'{0}'.format(group), index=sheet_num)
        wb_all.create_sheet(u'{0}'.format(group), index=sheet_num)
        row_first = ['日期', '首页', '总词数']
        row_first2 = ['日期', '域名', '首页', '总词数']
        # header rows
        wb[group].append(row_first)
        wb_all[group].append(row_first2)
        sheet_num += 1
    # Write the data rows
    for domain, dict_value in result_last.items():
        if domain == my_domain:
            for group, dict_index_all in dict_value.items():
                # our own site's workbook
                row_value = [today]
                for key, value in dict_index_all.items():
                    row_value.append(value)
                wb[u'{0}'.format(group)].append(row_value)

        for group, dict_index_all in dict_value.items():
            # workbook covering every monitored site
            row_value = [today, domain]
            for key, value in dict_index_all.items():
                row_value.append(value)
            wb_all[u'{0}'.format(group)].append(row_value)
    wb.save('{0}bdmo1_index.xlsx'.format(today))
    wb_all.save('{0}bdmo1_index_domains.xlsx'.format(today))

# Send the JS beacon request (not used for now)
def request_js(url, my_header, retry=1):
    try:
        r = requests.get(url=url, headers=my_header, timeout=2)
    except Exception as e:
        print('JS request failed', e)
        time.sleep(6)
        if retry > 0:
            return request_js(url, my_header, retry - 1)
    else:
        pass

# Build request headers with a randomly chosen cookie
def get_header(cookies):
    now_cookie = random.choice(cookies)
    my_header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Cookie': now_cookie,
        'Host': 'm.baidu.com',
        'Pragma': 'no-cache',
        'Referer': 'https://m.baidu.com/',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-origin',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
    }
    return my_header


class bdmoIndexMonitor(threading.Thread):

    def __init__(self):
        threading.Thread.__init__(self)

    @staticmethod
    def read_excel(filepath):
        # Read keywords into a queue; each sheet name is a keyword category
        q = queue.Queue()
        group_list = []
        wb_kwd = load_workbook(filepath)
        for sheet_obj in wb_kwd:
            sheet_name = sheet_obj.title
            group_list.append(sheet_name)
            col_a = sheet_obj['A']
            for cell in col_a:
                kwd = cell.value
                # skip empty cells
                if kwd:
                    q.put([sheet_name, kwd])
        return q, group_list

    # Initialise the result dict: {domain: {category: {'首页': 0, '总词数': 0}}}
    @staticmethod
    def result_init(group_list):
        result = {}
        for domain in domains:
            result[domain] = {}
            for group in group_list:
                result[domain][group] = {'首页': 0, '总词数': 0}
        print("result dict initialised...")
        return result

    # Fetch the SERP source for one keyword
    def get_html(self, url, my_header, retry=1):
        try:
            r = requests.get(url=url, headers=my_header, timeout=5)
        except Exception as e:
            print('failed to fetch page source', e)
            time.sleep(6)
            if retry > 0:
                return self.get_html(url, my_header, retry - 1)
            return '', url
        else:
            html = r.content.decode('utf-8', errors='ignore')  # r.text sometimes guesses the encoding wrong
            url = r.url  # anti-crawl redirects; keep the post-redirect URL
            return html, url

    # Get the div blocks on the SERP that contain ranking URLs
    def get_divs(self, html, url):
        div_list = []
        doc = pq(html)
        title = doc('title').text()
        if '- 百度' in title and 'https://m.baidu.com/s?ie=utf-8' in url:
            try:
                div_list = doc('.c-result').items()
            except Exception as e:
                print('failed to extract div blocks', e)
            else:
                pass
        else:
            print('abnormal page source ---------------------', title)
            time.sleep(120)
        return div_list

    # Extract the real ranking URLs
    def get_real_urls(self, div_list):
        real_urls_rank = []
        if div_list:
            for div in div_list:
                data_log = div.attr('data-log')
                data_log = json.loads(data_log.replace("'", '"'))  # JSON needs double quotes
                srcid = data_log['ensrcid'] if 'ensrcid' in data_log else 'ensrcid'  # style feature
                rank_url = data_log.get('mu')  # mu may be empty or missing
                rank = data_log['order']
                # if mu is empty, take the URL from .c-result-content article
                if rank_url:
                    real_urls_rank.append((rank_url, rank, srcid))
                else:
                    article = div('.c-result-content article')
                    link = article.attr('rl-link-href')
                    real_urls_rank.append((link, rank, srcid))
        return real_urls_rank

    # Extract the domain part of a URL
    def get_domain(self, real_url):
        domain = None
        try:
            res = urlparse(real_url)
        except Exception as e:
            print(e, real_url)
        else:
            domain = res.netloc
        return domain

    # Get all domains ranking on the first page for one keyword
    def get_domains(self, real_url_list):
        domain_list = [self.get_domain(real_url) for real_url in real_url_list]
        # if one domain ranks with several URLs for the same word, count it once
        domain_set = set(domain_list)
        domain_set.discard(None)  # note: set.remove() returns None, so never reassign its result
        domain_str = ','.join(domain_set)
        return domain_str

    # Worker thread: consume keywords from the queue
    def run(self):
        while 1:
            js_url = 'https://fclick.baidu.com/w.gif?baiduid=14E9731020ACEE14821E1A67DABB2862&asp_time=1581297830764&query={0}&queryUtf8={1}&searchid=a0bc28b872b56b7e&osid=1&bwsid=5&adt=0&adb=0&wst=146&top=0&wise=10&middle=0&bottom=0&adpos=t_0_0.00&pbt=146&yxh=0&zoom=1.0555555555555556&validHeight=521&initViewZone=w_1_0.00%3Aw_2_1.00&adsHeight=_w1%3A255_w2%3A255_w3%3A487_w4%3A228_w5%3A204_w6%3A165_w7%3A189_w8%3A255_w9%3A151_w10%3A103&adsCmatch=&availHeight=667&availWidth=375&winHeight=667&winWidth=375&action=init&model=%7B%22vt%22%3A%22w1%3A0%23w2%3A0%23w3%3A0%23w4%3A0%23w5%3A0%23w6%3A0%23w7%3A0%23w8%3A0%23w9%3A0%23w10%3A0%22%2C%22pt%22%3A%22%22%2C%22ext%22%3A%5B%5D%2C%22vsh%22%3A521%2C%22asid%22%3A%22%22%2C%22rd%22%3A1581297833317%7D&tag=ecom_wise_listen_n&rand=1581297833325.636'
            group_kwd = q.get()
            group,kwd = group_kwd
            print(group,kwd)
            try:
                url = "https://m.baidu.com/s?ie=utf-8&word={0}".format(kwd)
                print(url)
                js_url = js_url.format(kwd,kwd)
                # print(js_url)
                my_header = get_header(cookies)
                # request_js(js_url,my_header)
                html,now_url = self.get_html(url,my_header)
                divs_res = self.get_divs(html,now_url)
                # only write results if the page source is OK
                if divs_res:
                    real_urls_rank = self.get_real_urls(divs_res)
                    real_urls = []
                    for my_url, my_order, my_attr in real_urls_rank:
                        real_urls.append(my_url)
                        f_all.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(kwd, my_url, my_order, my_attr, group))
                    f_all.flush()
                    domain_str = self.get_domains(real_urls)
                    # did each target site show up?
                    for domain in domains:
                        if domain not in domain_str:
                            f.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(kwd, '无', '无', group, domain))
                        else:
                            for my_url, my_order, my_attr in real_urls_rank:
                                if domain in my_url:
                                    f.write('{0}\t{1}\t{2}\t{3}\t{4}\n'.format(kwd, my_url, my_order, group, domain))
                                    print(my_url, my_order)
                                    break  # keep only the first ranking URL
                f.flush()
            except Exception as e:
                print(e)
            finally:
                del kwd
                gc.collect()
                q.task_done()
                

if __name__ == "__main__":
    start = time.time()
    local_time = time.localtime()
    today = time.strftime('%Y%m%d',local_time)
    domains = ['5i5j.com','lianjia.com','anjuke.com','fang.com'] # target domains to monitor
    my_domain = '5i5j.com' # your own domain
    cookie1 = 'BDICON=10123156; wpr=0; ___rl__test__cookies=1581326691341; __cfduid=db1889b7d4272171df5d2b0ed76dbdacc1562148359; BDUSS=3IxU3dKbkhZb21TbkM0UkVYWC1XYldlb3RNMVM3bXV4dExqS1hVa3dLQzlRVXBlRVFBQUFBJCQAAAAAAAAAAAEAAAAGtiZkNzcyNDgzMjAwZG9uZwAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAL20Il69tCJeS; BAIDUID=14E9731020ACEE14821E1A67DABB2862:FG=1; MSA_ZOOM=1056; PSTM=1580184319; BIDUPSID=42DF5CED7B3ED990A9FF7BF52F7B4E0B; plus_lsv=f197ee21ffd230fd; plus_cv=1::m:49a3f4a6; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; BDICON=10123156; ysm=10303|10225; MSA_WH=360_640; COOKIE_SESSION=5_0_0_5_1_w6_17_5_9_0_0_4_71_1581302047%7C5%230_0_0_0_0_0_0_0_1581297718%7C1; FC_MODEL=-1_0_3_0_4.22_0_1_0_0_0_7.05_-1_9_29_5_49_0_1581302142088_1581302047543%7C9%234.22_-1_-1_9_5_1581302142088_1581302047543%7C9; SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a03313377111; delPer=0; SE_LAUNCH=5%3A26355444; BDPASSGATE=IlPT2AEptyoA_yiU4VKH3kIN8efjWvW4AfvESkplQFStfCaWmhH3BrUzWz0HSieXBDP6wZTXdMsDxXTqXlVXa_EqnBsZolpOaSaXzKGoucHtVM69-t5yILXoHUE2sA8PbRhL-3MEF2ZELlQvcgjchQZrchW8z3JTpxz1z5Xocc0T1UKR2VLJxJyTS7xvRHvcPNuz94rXnEpKKSmBUADHRVjYcSQyWXkD5NOtjsAm1Q0WrkoXGurSRvAa1G8vJpFeXAio1fWU60ul269v5HViViwh9UOI7u46MnJZ; ASUV=1.2.126; PSINO=1; MSA_PBT=146; H_PS_PSSID=1428_21118_30725_26350; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; H_WISE_SIDS=128070_141000_128700_139420_141694_139560_135847_141753_140201_139297_136863_138585_141650_138252_140114_140324_140579_133847_140793_140065_131423_141707_141678_107314_139882_140368_140798_137703_141104_110085_138450_140140_138878_137985_141200_140173_131246_137749_138165_138883_127969_139135_140622_140593_140864_138426_138944_140682_141190; rsv_i=a5e3IBVWUKqcypPGCWrSPDPcpgeGOAAo3xmhylFhucdzMXyM9coWDZnOM5jAMFUByy07DOHHNowUGJiH1e9P2p7upHlen3I; BDSVRTM=432; BDSVRBFE=Go; Hm_lvt_12423ecbc0e2ca965d84259063d35238=1581298781,1581301518,1581326682,1581329557; Hm_lpvt_12423ecbc0e2ca965d84259063d35238=1581329557; ___rl__test__cookies=1581329559195; OUTFOX_SEARCH_USER_ID_NCOO=325652931.79790616; wise_tj_ub=ci%4072_10_-1_-1_-1_-1_-1%7Ciq%4043_1_21_249%7Ccb%40-1_-1_-1_-1_-1_-1_-1%7Cce%401%7Ctse%401; __bsi=10579770543302101918_00_14_R_R_15_0303_c02f_Y'
    cookie2 = 'wpr=0; BIDUPSID=A96B8FDEBE3F28F791FEF4BE28060F3D; PSTM=1565010185; BAIDUID=7ED8824009DCA5F0A424742D653565B5:FG=1; BDUSS=mh2b3BDUEJ3dWZJa0lERjhpczY0WUJmNnFyaVRZYm12aFFhZmJPY3Nla2pnWDlkRVFBQUFBJCQAAAAAAAAAAAEAAADag5oxzI2IkAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACP0V10j9FddT; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; H_WISE_SIDS=114550_135847_128065_141000_138596_122159_141754_140202_140592_138585_141651_138252_140113_136196_140325_140578_133847_140792_140065_131423_107319_139883_140913_140351_140797_140968_136413_110085_138946_138903_138878_137985_141200_140174_131247_132552_141261_138165_138883_140260_127969_140621_140593_140864_138425_138944_141190_140597; plus_lsv=f197ee21ffd230fd; plus_cv=1::m:49a3f4a6; ysm=11410|11321; MSA_WH=360_640; MSA_PBT=146; MSA_ZOOM=1056; COOKIE_SESSION=303_0_0_3_1_t1_5_1_3_0_0_1_19_1581299397%7C3%230_0_0_0_0_0_0_0_1581299056%7C1; FC_MODEL=-1_0_3_0_0_0_0_0_0_0_0_-1_1_5_1_6_0_1581300039148_1581299397896%7C3%230_-1_-1_1_1_1581300039148_1581299397896%7C3; H_PS_PSSID=1444_21097_26350; rsv_i=56a2YVge%2FrpXYupBhl%2FpfMP4i4OOKlDL7zQgClgCjq8B9agBV%2FaI1mSPIK8hDfLNGApwbu0tnbggPFE%2F5dqtrNnJxkisVo8; FEED_SIDS=16396_0210_16; BDSVRTM=340; BDSVRBFE=Go; Hm_lvt_12423ecbc0e2ca965d84259063d35238=1581299053,1581327126; Hm_lpvt_12423ecbc0e2ca965d84259063d35238=1581327126; SE_LAUNCH=5%3A26355452; wise_tj_ub=ci%40132_8_-1_-1_-1_-1_-1%7Ciq%404_1_1_427%7Ccb%40-1_-1_-1_-1_-1_-1_-1%7Cce%401%7Ctse%401; BDICON=10123156; __bsi=10758836066815561144_00_10_R_R_7_0303_c02f_Y'
    cookie3 = 'BIDUPSID=FE794593CC0286003FE60E85C62AE0B1; PSTM=1581333675; BAIDUID=FE794593CC02860052F2C230B1A52437:FG=1; delPer=0; BDUSS=FNYYlJmNGNkaVc3eGl-djVsbkNJUzR4Q0YzY34xVVlkUW83NmFTdFFVbTh5V2hlRVFBQUFBJCQAAAAAAAAAAAEAAAANZAWttPPJ8bOkyfqyu8DPAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAALw8QV68PEFec; H_PS_PSSID=1439_21116_18560_26350; BDORZ=AE84CDB3A529C0F8A2B9DCDD1D18B695; SE_LAUNCH=5%3A26355561_0%3A26355561; __bsi=11493453298826831208_00_17_N_R_0_0303_c02f_Y; H_WISE_SIDS=128699_141845_141001_140109_135847_141753_139057_140201_140592_136862_138585_141651_138253_140113_140325_140579_133847_140792_140065_134046_131423_141708_107319_139884_140995_140966_137703_110085_138104_140854_138878_137979_141199_140174_131246_137743_138165_138883_127969_140593_138425_138941_140684_141191; rsv_i=def7CHH5bNa1P%2FM5grcxW8bBXQ0h9NDHKFtvthLPhXrGW85sHE%2BOKxOeBwoWQVN4jz4e8h5XVM5sMpg3%2BUVRqnYtlo%2BEaOg; BDSVRTM=371; plus_lsv=f197ee21ffd230fd; plus_cv=1::m:49a3f4a6; Hm_lvt_12423ecbc0e2ca965d84259063d35238=1581333718; Hm_lpvt_12423ecbc0e2ca965d84259063d35238=1581333743; BDSVRBFE=Go; wise_tj_ub=ci%40-1_-1_-1_-1_-1_-1_-1%7Ciq%403_2_3_264%7Ccb%40-1_-1_-1_-1_-1_-1_-1%7Cce%401%7Ctse%401; BDICON=10123156'
    cookies = [cookie3,cookie2,cookie1]
    q,group_list = bdmoIndexMonitor.read_excel('2020xiaoqu_kwd_city_1.xlsx')  # keyword queue and categories
    result = bdmoIndexMonitor.result_init(group_list)  # initialise the result dict
    all_num = q.qsize() # total keyword count
    f = open('{0}bdmo1_index_info.txt'.format(today),'w',encoding="utf-8")
    f_all = open('{0}bdmo1_index_all.txt'.format(today),'w',encoding="utf-8")
    file_path = f.name
    # number of worker threads
    for i in list(range(1)):
        t = bdmoIndexMonitor()
        t.daemon = True
        t.start()
    q.join()
    f.close()
    f_all.close()
    # compute the final result from bdmo1_index_info.txt
    result_last = get_result(file_path,result)
    # write the txt file
    write_domains_txt(result_last)
    # write the excel files
    write_myexcel(group_list,result_last,today,my_domain)
    end = time.time()
    print('{0} keywords processed, took {1} min'.format(all_num, (end - start) / 60))

A test run queried 36,000 residential-community keywords; here are the top 20 domains by coverage:

m.baidu.com 15.55%
34689.recommend_list.baidu.com 9.48%
m.anjuke.com 8.87%
baike.baidu.com 8.03%
m.lianjia.com 7.05%
m.fang.com 5.68%
m.5i5j.com 4.95%
mobile.anjuke.com 3.63%
m.focus.cn 3.08%
m.58.com 2.59%
nourl.ubs.baidu.com 2.03%
mpoi.mapbar.com 1.84%
m.ke.com 1.52%
zhidao.baidu.com 1.36%
map.baidu.com 1.13%
m.jiwu.com 1.00%
m.dianping.com 0.76%
m.city8.com 0.60%
m.loupan.com 0.59%
www.anjuke.com 0.55%

Appendix: the bdmo1_tj.py script

# -*- coding: utf-8 -*-
"""
根据bdmo1_index_all.txt数据统计域名覆盖率
sigma.baidu.com:xx_相关网站|xx_相关企业
recommend_list.baidu.com:其他人还在搜
nourl.ubs.baidu.com:搜索智能聚合
bzclk.baidu.com:结构化的展示样式
"""

import time
from urllib.parse import urlparse
from openpyxl import load_workbook
from openpyxl import Workbook


# Extract the domain part of a URL
def get_domain(real_url):
    # URLs taken from mu can be empty for non-organic results
    try:
        res = urlparse(real_url)  # an empty real_url does not raise
    except Exception as e:
        print(e, real_url)
        domain = "xxx"
    else:
        domain = res.netloc
    return domain


# Read the excel file to get the keyword categories (sheet names)
def read_excel(filepath):
    city_list = []
    wb_kwd = load_workbook(filepath)
    for sheet_obj in wb_kwd:
        sheet_name = sheet_obj.title
        city_list.append(sheet_name)
    return city_list

# Initialise the result dict
def result_init(group_list):
    result = {}
    for group in group_list:
        result[group] = {}
    print("result dict initialised...")
    return result

def save():
    res_format = result.items()
    # write the excel file
    wb = Workbook()
    # create one sheet per category
    sheet_num = 0
    for city in city_list:
        wb.create_sheet(u'{0}'.format(city), index=sheet_num)
        sheet_num += 1
    for city, data_dict in res_format:
        sort_dict = sorted(data_dict.items(), key=lambda s: s[1], reverse=True)
        for domain, num in sort_dict:
            row_value = [domain, num]
            wb[u'{0}'.format(city)].append(row_value)
    wb.save('{0}bdmo1_index_cover.xlsx'.format(today))

    # write the txt file (all categories combined)
    res_format = sorted(result_all.items(), key=lambda s: s[1], reverse=True)
    with open('{0}bdmo1_domain_res.txt'.format(today), 'w', encoding='utf-8') as f:
        for domain, num in res_format:
            f.write(domain + '\t' + str(num) + '\t' + '{:.2%}'.format(num / count) + '\n')


if __name__ == "__main__":
    start = time.time()
    local_time = time.localtime()
    today = time.strftime('%Y-%m-%d', local_time)
    today = '20200210'  # hard-coded to match the date in the data file name
    city_list = read_excel('2020xiaoqu_kwd_city_1.xlsx')
    result = result_init(city_list)  # per-category tallies
    result_all = {}  # tallies across all categories

    # the file is large; count its lines first (needed for the coverage percentages)
    count = sum(1 for line in open('{0}bdmo1_index_all.txt'.format(today), 'r', encoding='utf-8'))
    print(count)

    # count how many times each domain appears
    for i in open('{0}bdmo1_index_all.txt'.format(today), 'r', encoding='utf-8'):
        line = i.strip().split('\t')
        url = line[1]
        city = line[4]
        style = line[3]  # style feature, e.g. rel_ugc is the hot-topic block
        if url.startswith('http'):
            domain = get_domain(url)
            result[city][domain] = result[city][domain] + 1 if domain in result[city] else 1
            result_all[domain] = result_all[domain] + 1 if domain in result_all else 1
        else:
            # no URL: count the style feature instead
            result[city][style] = result[city][style] + 1 if style in result[city] else 1
            result_all[style] = result_all[style] + 1 if style in result_all else 1
    # save the results to file
    save()

    end = time.time()