Scrapy Web Crawler Framework in Practice [Tencent News as an Example]
This is an original blog post, intended for technical learning only. Please do not copy it or repost it to platforms such as Baidu Wenku without permission.
Contents
- Introduction
- URLs to crawl
- Framework architecture
- Writing items
- Writing the Spider
- Writing the storage pipelines
- Writing the settings configuration
- Writing the main entry point
- Results
Introduction
For an introduction to Scrapy and a Douban case study, please see my other two blog posts:
http://blog.csdn.net/qy20115549/article/details/52528896
http://blog.csdn.net/qy20115549/article/details/52575291
URLs to crawl
As shown below, there are many URLs to crawl, stored in a txt text file; one of them, for example, is //stock.qq.com/a/20160919/007925.htm.
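Judging from the string cleanup in the Spider code further down (the replace("['", "") and replace("']", "") calls), each line of E:\a.txt presumably stores one URL wrapped in Python-list notation. A minimal sketch under that assumption; note that Scrapy's start_urls need a scheme, so protocol-relative links like the one above may need an http: prefix:

# assumed format of E:\a.txt: one bracketed URL per line, e.g. ['//stock.qq.com/a/20160919/007925.htm']
line = "['//stock.qq.com/a/20160919/007925.htm']"
url = line.strip().replace("['", "").replace("']", "")
if url.startswith("//"):
    url = "http:" + url   # give Scrapy a scheme to work with
print url                 # -> http://stock.qq.com/a/20160919/007925.htm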
Framework architecture
Writing items
For simplicity, I only crawl the news title and body text. items.py is shown below:
# -*- coding: utf-8 -*-
__author__ = 'HeFei University of Technology Qian Yang email:1563178220@qq.com'
import scrapy


class News(scrapy.Item):
    content = scrapy.Field()
    title = scrapy.Field()
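A scrapy.Item can be filled and read much like a dict; a quick sanity check (my own sketch, not part of the project code):

# sketch: an Item behaves much like a dict
from tengxunnews.items import News

n = News()
n['title'] = u'some headline'
n['content'] = [u'paragraph 1', u'paragraph 2']
print n['title']   # -> some headline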
Writing the Spider
# -*- coding:utf-8 -*-
__author__ = 'HeFei University of Technology Qian Yang email:1563178220@qq.com'
import scrapy
from tengxunnews.items import News


class Teng(scrapy.Spider):
    name = 'tengxunnews'
    allowed_domains = ["qq.com"]
    # read the start URLs from the text file, one per line
    start_urls = []
    f = open("E:\\a.txt", "r")
    while True:
        line = f.readline()
        if line:
            # strip the surrounding ['...'] from each line
            line = line.strip().replace("['", "").replace("']", "")
            print "the url is %s" % line
            start_urls.append(line)
        else:
            break
    f.close()

    def parse(self, response):
        item = News()
        item['content'] = response.xpath('//div[@id="Cnt-Main-Article-QQ"]/p/text()').extract()
        item['title'] = response.xpath('//div[@class="hd"]/h1/text()').extract()
        yield item
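Both XPath queries return lists of strings from extract(), so content ends up as a list of paragraph texts. If you would rather store one string per article, a minimal variant of parse (my own sketch, not the original code) could join the paragraphs; note the MySQL pipeline below indexes item['content'][0], so it would need the matching adjustment:

    def parse(self, response):
        item = News()
        paragraphs = response.xpath('//div[@id="Cnt-Main-Article-QQ"]/p/text()').extract()
        item['content'] = u"\n".join(p.strip() for p in paragraphs)  # one string instead of a list
        titles = response.xpath('//div[@class="hd"]/h1/text()').extract()
        item['title'] = titles[0] if titles else u''
        yield item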
Writing the storage pipelines
# -*- coding: utf-8 -*-
__author__ = 'HeFei University of Technology Qian Yang email:1563178220@qq.com'
import json
import codecs


# store the items as JSON, one object per line
class JsonWithEncodingCnblogsPipeline(object):
    def __init__(self):
        self.file = codecs.open('tengxunnews.json', 'w', encoding='gbk')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(line)
        return item

    def close_spider(self, spider):
        # Scrapy calls close_spider() on each pipeline when the spider finishes
        self.file.close()


# store the items in a MySQL database
from twisted.enterprise import adbapi
import MySQLdb
import MySQLdb.cursors


class MySQLStorePipeline(object):
    def __init__(self):
        # database connection parameters
        dbargs = dict(
            host='127.0.0.1',
            db='test',
            user='root',
            passwd='112233',
            cursorclass=MySQLdb.cursors.DictCursor,
            charset='utf8',
            use_unicode=True,
        )
        self.dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)

    # the default pipeline entry point
    def process_item(self, item, spider):
        self.dbpool.runInteraction(self.insert_into_table, item)
        return item

    # insert into the target table; the table must be created in advance
    def insert_into_table(self, conn, item):
        conn.execute('insert into tengxunnews(content, title) values(%s,%s)', (
            item['content'][0],
            item['title'][0])
        )
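insert_into_table assumes the tengxunnews table already exists. A one-off helper to create it (my own sketch; the column names come from the insert statement above, the column types are assumptions):

# sketch: create the table that insert_into_table expects (column types assumed)
import MySQLdb

conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='112233',
                       db='test', charset='utf8')
cur = conn.cursor()
cur.execute("""
    CREATE TABLE IF NOT EXISTS tengxunnews (
        id INT AUTO_INCREMENT PRIMARY KEY,
        title VARCHAR(255),
        content TEXT
    ) DEFAULT CHARSET=utf8
""")
conn.commit()
conn.close()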
Writing the settings configuration
settings.py mainly holds project configuration; the following is what I appended at the end of the file.
# USER_AGENT
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
# start of MySQL database configuration
MYSQL_HOST = 'localhost'
MYSQL_DBNAME = 'test'
MYSQL_USER = 'root'
MYSQL_PASSWD = '11223'
# end of MySQL database configuration
ITEM_PIPELINES = {
    'tengxunnews.pipelines.JsonWithEncodingCnblogsPipeline': 300,
    # give each pipeline a distinct order value; lower numbers run first
    'tengxunnews.pipelines.MySQLStorePipeline': 301,
}
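As written, MySQLStorePipeline hardcodes its connection parameters, so the MYSQL_* values above are never actually read (note that MYSQL_PASSWD also differs from the password hardcoded in the pipeline). If you want the pipeline to pick the values up from settings.py, a common Scrapy pattern is a from_crawler classmethod; a minimal sketch, not the original code:

# sketch: let the pipeline read its parameters from settings.py
class MySQLStorePipeline(object):
    def __init__(self, host, db, user, passwd):
        dbargs = dict(host=host, db=db, user=user, passwd=passwd,
                      cursorclass=MySQLdb.cursors.DictCursor,
                      charset='utf8', use_unicode=True)
        self.dbpool = adbapi.ConnectionPool('MySQLdb', **dbargs)

    @classmethod
    def from_crawler(cls, crawler):
        s = crawler.settings
        return cls(s.get('MYSQL_HOST'), s.get('MYSQL_DBNAME'),
                   s.get('MYSQL_USER'), s.get('MYSQL_PASSWD'))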
Writing the main entry point
__author__ = 'HeFei University of Technology Qian Yang email:1563178220@qq.com'
from scrapy import cmdline

# equivalent to running "scrapy crawl tengxunnews" on the command line
cmdline.execute("scrapy crawl tengxunnews".split())
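Since cmdline.execute simply runs the scrapy crawl command programmatically, this script needs to be launched from the project root (the directory containing scrapy.cfg), e.g. with python main.py.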
Results
If you have questions, please contact: Qian Yang, School of Management, Hefei University of Technology, 1563178220@qq.com