[問題] 證交所集保庫存的特殊字元抓取問題。
小弟我想要抓取證交所集保庫存的資料然後Print出來,使用的Python3.5,
目前發生的問題是在抓取台積電或是台泥等都沒有問題,但是假如是一些特
殊的公司如"美吉吉-KY"就會發生編碼問題,而我在以下程式碼中
Tdcc.parse(webcode.read().decode('cp950')) 也使用過utf-8
基本上都還是會出問題,請問到底是編碼的問題?還是是code本身有誤?
還請各位賜教,以下是我的程式碼(假如要改看台積電就把url裡面的8466改成2330就好)
:
# -*- coding: utf8 -*-
# coding: utf8
import urllib.request, urllib.parse, urllib.error, codecs
from html.parser import HTMLParser
def TdccData(URL):
webcode = urllib.request.urlopen(URL) #解析網頁開始
if webcode.code == 200:
Tdcc = ParseWebData()
#Tdcc.parse(webcode.read().decode('cp950'))
Tdcc.parse(webcode.read().decode('utf-8-sig'))
#Tdcc.parse(webcode.read())
Tdcc.close()
if len(Tdcc.cell) > 0 :
return Tdcc.cell
else:
return []
class ParseWebData(HTMLParser):
def __init__(self): #初始化class等同constructor
HTMLParser.__init__(self)
def reset(self): #初始化變數數值
HTMLParser.reset(self)
self.headname = False
self.center = False
self.right = False
self.cell = []
self.centercount = 0
self.rightcount = 0
def parse(self,data): #解析網頁
self.feed(data)
self.close()
def handle_starttag(self, td, attrs):
for name,value in attrs:
if name =='class' and value == 'wuc9':
self.headname = True
if name =='align' and value == 'center':
self.center = True
self.centercount+=1
if name =='align' and value == 'right':
self.right = True
self.rightcount+=1
def handle_data(self, text): #開始讀取集保資料到暫存list中
if self.headname:
self.headname = False #print text
if self.center:
if self.centercount == 1:
self.center = False #print "center : " + text
if self.centercount == 2:
self.center = False #print "center : " + text
self.centercount = 0
if self.right :
if self.rightcount == 1:
self.cell.append(text.strip().replace(",", "")) #print "right : " + text
self.right = False
if self.rightcount == 2:
self.cell.append(text.strip().replace(",", "")) #print "right : " + text
self.right = False
if self.rightcount == 3:
self.cell.append(text.strip()) #print "right : " + text
self.rightcount = 0
self.right = False
def main():
url =
"http://www.tdcc.com.tw/smWeb/QryStock.jsp?SCA_DATE=20170113&SqlMethod=StockNo&StockNo=8466&StockName=&sub=%ACd%B8%DF"
cell = TdccData(url)
print(cell)
if __name__ == "__main__":
main()
--
※ 發信站: 批踢踢實業坊(ptt.cc), 來自: 59.124.166.19
※ 文章網址: https://www.ptt.cc/bbs/Python/M.1484637353.A.4B3.html
推
01/17 15:49, , 1F
01/17 15:49, 1F
→
01/17 15:49, , 2F
01/17 15:49, 2F
→
01/17 16:00, , 3F
01/17 16:00, 3F
Python 近期熱門文章
PTT數位生活區 即時熱門文章