Python采集百度地图数据

Python   2012-03-11 14:50:44 发布
您的评价:
     
3.5
收藏     3收藏
文件夹
标签
(多个标签用逗号分隔)
百度地图依托百度强大的中文搜索引擎数据,收录了海量的公司联系方式,比Google要强,更别说什么黄页网站了。
因为一些业务需要,写了这个行业公司地址采集程序,使用方便,直接运行,支持命令行设定查询参数。

使用方法:
把代码保存成bmap.py
python bmap.py

python bmap.py 服饰厂

运行后会自动采集百度地图中所有的结果,保存为以tab分隔的txt文件,方便导入各种数据库。(截图:10164653_enfn.png)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2012 Channing Wong
#
# @mail: channing.wong@yahoo.com
# @home: http://blog.3363.me/
# @date: Mar 3, 2012
#

import json
import sys
import time
import types
import urllib

# Python 2 only: make UTF-8 the process-wide default string encoding.
# sys.setdefaultencoding is removed from the sys module during interpreter
# start-up, so the module has to be reloaded before it can be called.
# NOTE(review): presumably this is what lets the unicode text decoded from
# the JSON responses be written to the plain output file without an explicit
# encode step -- confirm before removing.
reload(sys)
sys.setdefaultencoding('utf-8')


class BaiduMap(object):
    """Collect business listings for one keyword from Baidu Maps.

    Constructing an instance sends an initial search request and stores the
    cities that have results in ``self.city`` and the sum of their result
    counts in ``self.total_num``.  ``get_all`` then pages through every city
    and appends one tab-separated line per listing (city name, listing name,
    address, phone) to ``<keyword>.txt``.
    """

    def __init__(self, keyword):
        """Look up the cities matching *keyword* and open the output file.

        keyword: search term; also used as the base name of the output file.

        Exits the process through ``sys.exit`` when the search response does
        not contain a list of cities.
        """
        self.keyword = keyword
        self.query = [
            ('b', '(-1599062.039999999,811604.75;24779177.96,8168020.75)'),
            ('c', '1'),
            ('from', 'webmap'),
            ('ie', 'utf-8'),
            ('l', '4'),
            ('newmap', '1'),
            ('qt', 's'),
            ('src', '0'),
            ('sug', '0'),
            ('t', int(time.time())),
            ('tn', 'B_NORMAL_MAP'),
            ('wd', keyword),
            ('wd2', ''),
        ]
        self.mapurl = 'http://map.baidu.com/'
        self.file = None
        self.count = 0      # listings written so far, across all cities
        self.count_c = 0    # listings written for the city being processed
        self.total_num = 0  # sum of the per-city result counts

        self._get_city()

        # Opened only after the city lookup succeeded, so a rejected keyword
        # or a failed request leaves neither an empty file nor an open handle.
        self.file = open('%s.txt' % keyword, 'w')

    def _fetch(self, query=None, json=True):
        """Send a GET request to ``self.mapurl`` and return the response.

        query: sequence of ``(name, value)`` pairs used as the query string;
            ``None`` sends no parameters.
        json: when true, return the body parsed by ``_tojson`` (``None`` if
            the body is not valid JSON); otherwise return the raw body.
        """
        # NOTE: the ``json`` parameter hides the json module inside this
        # method, which is why parsing is delegated to ``_tojson``.
        url = self.mapurl + '?' + urllib.urlencode(query or [])
        response = urllib.FancyURLopener().open(url)
        try:
            body = response.read()
        finally:
            # Release the connection even when reading fails.
            response.close()

        if json:
            return self._tojson(body)
        return body

    def _tojson(self, data):
        """Parse *data* as JSON; return ``None`` when it cannot be parsed."""
        try:
            return json.loads(data)
        except (TypeError, ValueError):
            # Only parse failures are swallowed; KeyboardInterrupt and
            # SystemExit now propagate instead of being hidden.
            return None

    def _get_city(self):
        """Populate ``self.city`` and ``self.total_num`` from the search.

        Prints an error and exits the process when the response is missing,
        is not a JSON object, or its ``content`` entry is not a list.
        """
        data = self._fetch(self.query)

        content = data.get('content') if isinstance(data, dict) else None
        if not isinstance(content, list):
            print('keyworld error.')
            sys.exit()

        self.city = content

        # Further cities arrive grouped under ``more_city``; flatten them
        # into the same list.
        for group in data.get('more_city') or []:
            self.city.extend(group['city'])

        self.total_num += sum(city['num'] for city in self.city)

    def _get_data(self, city, page=0):
        """Fetch one result page for *city* and return the parsed response.

        city: a city entry from ``self.city``; its ``geo`` and ``code``
            fields are used to build the request.
        page: zero-based page index, sent as the ``pn`` parameter.

        Returns the parsed JSON response, or ``None`` if it was not JSON.
        """
        query = [
            ('addr', '0'),
            ('b', '(%s)' % city['geo'].split('|')[1]),
            ('c', city['code']),
            ('db', '0'),
            ('gr', '3'),
            ('ie', 'utf-8'),
            ('l', '9'),
            ('newmap', '1'),
            ('on_gel', '1'),
            ('pn', page),
            ('qt', 'con'),
            ('src', '7'),
            ('sug', '0'),
            ('t', int(time.time())),
            ('tn', 'B_NORMAL_MAP'),
            ('wd', self.keyword),
            ('wd2', ''),
        ]
        return self._fetch(query)

    def _save(self, content, city):
        """Write one tab-separated line per listing and print progress.

        content: iterable of listing dicts; ``name`` is required, ``addr``
            and ``tel`` are written as empty fields when absent.
        city: the city entry the listings belong to (``name`` and ``num``
            are used).
        """
        for item in content:
            self.count += 1
            self.count_c += 1

            line = '%s\t%s\t%s\t%s\n' % (
                city['name'],
                item['name'],
                item.get('addr', ''),
                item.get('tel', ''),
            )
            self.file.write(line)
            print('(%s/%s) %s[%s/%s]' % (
                self.count, self.total_num,
                city['name'], self.count_c, city['num']))

    def get(self, city):
        """Fetch every result page of *city* and save the listings."""
        self.count_c = 0
        # Ceiling of num / 10, written with explicit floor division:
        # one request per page, ten listings per page.
        pages = abs(-city['num'] // 10)
        for page in range(pages):
            data = self._get_data(city, page)
            # A failed or non-JSON response yields None; skip that page
            # instead of aborting the whole run.
            if isinstance(data, dict) and 'content' in data:
                self._save(data['content'], city)

    def get_all(self):
        """Process every city in ``self.city``, then close the output file."""
        try:
            for city in self.city:
                self.get(city)
        finally:
            # Close (and flush) the file even if a request fails midway.
            self.file.close()


if __name__ == '__main__':
    # The first command-line argument, when given, replaces the built-in
    # default search keyword.
    keyword = sys.argv[1] if len(sys.argv) > 1 else '钻石'

    # Constructing the collector already performs the city lookup, so the
    # summary below can be shown before any listings are downloaded.
    baidumap = BaiduMap(keyword)
    print('_' * 20)
    print('CITY: %s' % len(baidumap.city))
    print('DATA: %s' % baidumap.total_num)
    baidumap.get_all()

扩展阅读

史上最全Python数据分析学习路径图
baidupan:百度盘的Python SDK
近200篇机器学习&深度学习资料分享(含各种文档,视频,源码等)
机器学习与深度学习资料
机器学习(Machine Learning)&深度学习(Deep Learning)资料

为您推荐

“最美天气”Python抓取天气
PostgreSQL 数据库的简单操作PHP类
flask权限管理
计算机会议最佳论文整理
【译】使用 AngularJS 和 Electron 构建桌面应用

更多

Python
Python开发
相关文档  — 更多
相关经验  — 更多
相关讨论  — 更多