1. 利用 CasperJS 爬取新浪股市排行数据，生成数据文件
// Fetch the Sina stock attention rankings with CasperJS and dump each
// ranking table to a data file.
//
// This script is executed by CasperJS (on top of PhantomJS/SlimerJS), whose
// JavaScript engine may not support ES2015+ syntax, so it deliberately sticks
// to ES5 (`var`, function expressions, CommonJS `require`).

var fs = require('fs');

// 1. Create the CasperJS instance.
var casper = require('casper').create({
  waitTimeout: 10000,
  verbose: true,
  logLevel: 'debug',
  // jQuery is injected into the page because the page-side scraping code
  // below relies on `$`.
  clientScripts: ['../jquery-3.1.1.min.js'],
  pageSettings: {
    loadImages: false,
    loadPlugins: false,
    userAgent: 'Mozilla/5.0 (Windows NT 6.1; rv:17.0) Gecko/20100101 Firefox/17.0'
  }
});

/**
 * Page-side scraper: runs inside the loaded page via `casper.evaluate`, so it
 * must not reference anything from this script's scope except its argument.
 *
 * Collects the trimmed text of every <td> of every row of the selected table
 * except the first row (index 0 is skipped by `tr:gt(0)`). Each row array is
 * terminated with a '\n' element.
 *
 * @param {string} tableSelector jQuery selector of the ranking table.
 * @returns {Array.<Array.<string>>} One array of cell texts per table row.
 */
function extractRows(tableSelector) {
  var rows = [];
  $(tableSelector).find('tr:gt(0)').each(function () {
    var cells = [];
    $(this).find('td').each(function () {
      cells.push($(this).text().trim());
    });
    cells.push('\n');
    rows.push(cells);
  });
  return rows;
}

/**
 * Builds a CasperJS step that scrapes one ranking table and writes it to a
 * file.
 *
 * The rows are serialised with the default array-to-string conversion
 * (comma-joined, with the '\n' marker elements embedded), which is the same
 * content the file received when the array was handed to `fs.write` directly.
 *
 * @param {string} tableSelector jQuery selector of the ranking table.
 * @param {string} filename Path of the output data file.
 * @returns {Function} Step function suitable for `casper.then`.
 */
function dumpRanking(tableSelector, filename) {
  return function getrank() {
    var rank = casper.evaluate(extractRows, tableSelector);
    if (!rank) {
      // evaluate() yields null when the page-side function fails (for
      // example when jQuery was not injected); report it instead of failing
      // silently, and still produce an (empty) output file.
      casper.log('No ranking data scraped for ' + tableSelector, 'warning');
      rank = [];
    }
    fs.write(filename, String(rank), 'w');
  };
}

casper.start();

// 2. Open the Sina stock popularity ranking page.
casper.thenOpen('http://touzi.sina.com.cn/public/bhot');

// 3. Output the market-wide ranking of 1-day change in user attention.
casper.then(dumpRanking('#allday1', 'data/allday1_sina.txt'));

// 4. Output the market-wide ranking of 5-day change in user attention.
casper.then(dumpRanking('#allday5', 'data/allday5_sina.txt'));

casper.run();
2. 使用 Python 入库：读取 CasperJS 生成的数据文件，写入 MySQL
#!/usr/bin/evn python# -*- coding:utf-8 -*-import MySQLdbimport os,sys,subprocessclass Spider(): def __init__(self, filename, js_filename, table): self.url = url self.filename = filename self.table = table def read_file_data(self): value_sets = [] try: with open(self.filename, 'rb') as data: count = 0 for line in data: value = line.decode("gb2312").encode("utf-8").split() value_sets.append(value) count += 1 return value_sets, count except Exception,e: print Exception,":",e def get_insert_sql(self): """ 获取table字段list,返回插入数据sql语句 """ try: cursor = db.cursor() cursor.execute("select * from %s limit 1" % self.table) field_name_list = [each[0] for each in cursor.description] del field_name_list[0] #去除自增id字段名 column_list = "(" + ",".join([field for field in field_name_list]) + ")" values_format = "values(" + ("%s,"*len(field_name_list)).rstrip(',') + ")" insert_sql = "INSERT INTO %s" % self.table+column_list + values_format return insert_sql except Exception, e: print("Error: %s" % e) cursor.close() db.close() def save(self): value_sets, count = self.get_file_data() insert_sql = self.get_insert_sql() if not (value_set and count): print "get data fom file failed" if not insert_sql: print "get insert_sql failed" try: cursor = db.cursor() cursor.executemany(insert_sql, value_sets) db.commit() print (u"成功插入数据%d条" % count) except Exception, e: db.rollback() print Exception, ":", e print (u"插入数据失败,数据回滚") cursor.close() db.close()