Python采集百度地图数据

jopen 12年前

百度利用其强大的中文搜索引擎数据,结合地图应用,包含了海量的公司联系方式,比Google要强,更别说什么黄页网站了。
因为一些业务需要,写了这个行业公司地址采集程序,使用方便,直接运行,支持命令行设定查询参数。

使用方法:
把代码保存成 bmap.py,然后直接运行(不带参数时使用默认关键词"钻石"):
python bmap.py

也可以在命令行中指定要查询的关键词,例如:
python bmap.py 服饰厂

运行后会自动采集百度地图中所有的结果,保存为以 tab 分隔的 txt 文件(文件名为"关键词.txt"),方便导入各种数据库。(运行截图:10164653_enfn.png)

#!/usr/bin/env python  # -*- coding: utf-8 -*-  #  # Copyright 2012 Channing Wong  #  # @mail: channing.wong@yahoo.com  # @home: http://blog.3363.me/  # @date: Mar 3, 2012  #    import json  import sys  import time  import types  import urllib    reload(sys)  sys.setdefaultencoding('utf-8')      class BaiduMap:      """      """      def __init__(self, keyword):          self.keyword = keyword          self.query = [                  ('b', '(-1599062.039999999,811604.75;24779177.96,8168020.75)'),                  ('c', '1'),                  ('from', 'webmap'),                  ('ie', 'utf-8'),                  ('l', '4'),                  ('newmap', '1'),                  ('qt', 's'),                  ('src', '0'),                  ('sug', '0'),                  ('t', time.time().__int__()),                  ('tn', 'B_NORMAL_MAP'),                  ('wd', keyword),                  ('wd2', '')                   ]          self.mapurl = 'http://map.baidu.com/'          self.file = open('%s.txt' % keyword, 'w')          self.count = 0          self.count_c = 0          self.total_num = 0            self._get_city()        def _fetch(self, query=None, json=True):          data = urllib.urlencode(query)          url = self.mapurl + '?' + data          opener = urllib.FancyURLopener()          data = opener.open(url).read()            if json:              return self._tojson(data)          else:              return data        def _tojson(self, data):          try:              js = json.loads(data, 'utf-8')          except:              js = None            return js        def _get_city(self):          data = self._fetch(self.query)            if type(data['content']) is not types.ListType:              print 'keyworld error.'              
sys.exit()            self.city = data['content']            if data.has_key('more_city'):              for c in data['more_city']:                  self.city.extend(c['city'])            for city in self.city:              self.total_num += city['num']        def _get_data(self, city, page=0):          query = [                  ('addr', '0'),                  ('b', '(%s)' % city['geo'].split('|')[1]),                  ('c', city['code']),                  ('db', '0'),                  ('gr', '3'),                  ('ie', 'utf-8'),                  ('l', '9'),                  ('newmap', '1'),                  ('on_gel', '1'),                  ('pn', page),                  ('qt', 'con'),                  ('src', '7'),                  ('sug', '0'),                  ('t', time.time().__int__()),                  ('tn', 'B_NORMAL_MAP'),                  ('wd', self.keyword),                  ('wd2', ''),                   ]          data = self._fetch(query)          return data        def _save(self, content, city):          for c in content:              self.count += 1              self.count_c += 1              if c.has_key('tel'):                  tel = c['tel']              else:                  tel = ''                _data = '%s\t%s\t%s\t%s\n' % (city['name'], c['name'], c['addr'], tel)              self.file.write(_data)              print '(%s/%s) %s[%s/%s]' % (self.count, self.total_num, city['name'], self.count_c, city['num'])        def get(self, city):          self.count_c = 0          pages = abs(-city['num'] / 10)          for page in range(0, pages):              data = self._get_data(city, page)              if data.has_key('content'):                  self._save(data['content'], city)        def get_all(self):          for city in self.city:              self.get(city)            self.file.close()      if __name__ == '__main__':      if sys.argv.__len__() > 1:          keyword = sys.argv[1]      else:          keyword = '钻石'        baidumap = 
BaiduMap(keyword)      print '_' * 20      print 'CITY: %s' % baidumap.city.__len__()      print 'DATA: %s' % baidumap.total_num      baidumap.get_all()