国内最大的小游戏网站是哪个?如果要山寨的话,做这个要先调研,这回只山寨一个就好,节约时间。
4399.com Alexa网站流量排名: 289 CN的流量排名: 62 反向链接:2,053
xiaoyouxi.com Alexa网站流量排名: 43,110 CN的流量排名: 8,149 反向链接:1,999
7k7k.com Alexa网站流量排名: 712 CN的流量排名: 121 反向链接:1,259
搜了几下,搜到一个排名,中国在线Flash第一的小游戏网站觉得非4399莫属了,今天就山寨4399吧。今天下载它的所有flash试试。下面是两年前写的代码,现在看起来写得很幼稚,那时正则表达式还不太熟练,还在用SGMLParser这样的东西,代码里还有大量的+号,也不够pythonic,思维到现在还是过程式编程。总之,下面贴出来的不能用了,网站url都变了,今天又要重写了。
# -*- coding: utf-8 -*-
#coding = utf-8
# Filename: getflash.py
#author = yobin
#2008.3
import re,os,sys
import urllib,socket
from sgmllib import SGMLParser
from threadpool import WorkerManager
# Record layout: (image) name category trial-play (link) size speed description (source)
# One text buffer per site; crawler jobs append one line per game and the
# matching save_result_*() function writes the buffer to disk.
result_4399 = ''
result_xiaoyouxi = ''
result_7k7k = ''
result_yx007 = ''
result_gamecomcn = ''
result_3839 = ''
# NOTE: this name shadows the builtin `type`; it is kept unchanged because
# other parts of the script may refer to it.
type = sys.getfilesystemencoding()
###==================================================================
def get_url_data(url, retries=5):
    """Fetch *url* and return the response body, retrying on failure.

    url     -- address passed to urllib.urlopen().
    retries -- maximum number of attempts (default 5, as before).

    Returns the body as read from the connection, or None once every
    attempt has failed (a single failure message is printed in that case).
    """
    for _attempt in range(retries):
        try:
            sock = urllib.urlopen(url)
            try:
                return sock.read()
            finally:
                # Always release the connection, even when read() raises.
                sock.close()
        except Exception:
            # Any failure counts as one attempt.  `Exception` rather than a
            # bare `except:` so Ctrl-C / SystemExit still stop the crawl.
            continue
    print("get url fail:%s" % (url))
    return None
class classLister_4399(SGMLParser):
    """Collect game links, titles and thumbnails from a 4399 listing page.

    The handlers form a small state machine driven by ``self.bScope``:

    0 -- idle; a <span class="style04"> moves to state 1.
    1 -- waiting for an <img>: the '/images/xy.gif' image moves to state 3,
         any other image is recorded in ``imgs`` and moves to state 2.
    2 -- an <a class="color6"> has its href recorded in ``urls``; the next
         text chunk is recorded in ``titles`` and moves back to state 1.
    3 -- the next <a> carrying an href supplies the page number stored in
         ``pages`` and moves back to state 0.

    After feed()/close(), ``urls``, ``titles`` and ``imgs`` hold what was
    collected and ``pages`` holds the number taken from the pagination link.
    """

    def reset(self):
        """Reset the base parser and clear everything collected so far."""
        SGMLParser.reset(self)
        self.bScope = 0
        self.urls = []
        self.titles = []
        self.imgs = []
        self.pages = 0
        self.total = 0

    def start_span(self, attrs):
        """Enter state 1 on <span class="style04"> when idle."""
        classes = [v for k, v in attrs if k == 'class']
        if classes and classes[0] == "style04" and self.bScope == 0:
            self.bScope = 1

    def start_img(self, attrs):
        """In state 1, record a thumbnail or detect the pagination image."""
        if self.bScope != 1:
            return
        imgsrc = [v for k, v in attrs if k == 'src']
        if not imgsrc:
            return
        if imgsrc[0] == '/images/xy.gif':
            self.bScope = 3
        else:
            self.imgs.extend(imgsrc)
            self.bScope = 2

    def start_a(self, attrs):
        """Record a game link (state 2) or the page number (state 3)."""
        if self.bScope == 2:
            myclass = [v for k, v in attrs if k == 'class']
            if myclass and myclass[0] == 'color6':
                self.urls.extend(v for k, v in attrs if k == 'href')
        # Deliberately a second `if`, not `elif`, mirroring the two
        # independent checks of the original handler.
        if self.bScope == 3:
            href = [v for k, v in attrs if k == 'href']
            if href:
                self.bScope = 0
                # The number after the last '_' of e.g. '..._7.htm'.
                text = href[0].replace('.htm', '')
                self.pages = int(text.split('_')[-1])

    def handle_data(self, text):
        """In state 2, take the text chunk as the title and return to state 1."""
        if self.bScope == 2:
            self.bScope = 1
            self.titles.append(text)
###==================================================================
def get_swfurl_4399(pageurl):
    """Fetch a 4399 game page and cut it down to its SWF path and trailing text.

    '</script>' and '<br/>' are removed first.  Then everything from
    '<!DOCTYPE' through the last 'var str<N> = "' is dropped, and everything
    from the first '<iframe' through the last '</HTML>' is dropped.  What
    remains is returned as a string.

    Raises IOError if the page cannot be fetched.
    """
    htmlContent = get_url_data(pageurl)
    if htmlContent is None:
        # Previously this surfaced as an AttributeError on None.replace().
        raise IOError("could not fetch %s" % (pageurl))
    htmlContent = htmlContent.replace('</script>', '').replace('<br/>', '')
    # NOTE(review): the `*` quantifiers are restored on the assumption that
    # they were stripped when the script was pasted (the published patterns
    # read '<!DOCTYPE.var ...' and '<iframe.</HTML>', which cannot match a
    # real page) -- confirm against an actual game page.
    head = re.compile(r'<!DOCTYPE.*var str\d+ = "', re.DOTALL)
    tail = re.compile(r"<iframe.*</HTML>", re.DOTALL)
    return tail.sub('', head.sub('', htmlContent))
###==================================================================
def download_swf_4399(url, path, title=None, img=None, des=None, size='Unknown'):
    """Resolve one game page and append its record line to ``result_4399``.

    url   -- site-relative address of the game page ('http://www.4399.com'
             is prepended).
    path  -- category label written as the first field of the record.
    title -- game title.
    img   -- thumbnail address.
    des   -- ignored; the description is always taken from the page.
    size  -- size label written as the last field.

    Returns None.  The record is "path title swfurl img des size" on one line.
    """
    global result_4399
    page_url = 'http://www.4399.com' + url
    # The reduced page text is split on '";': the first piece is the SWF
    # path, the last piece is used as the description.
    pieces = get_swfurl_4399(page_url).split('";')
    swfurl = 'http://www.gg173.net:8080/4399swf' + pieces[0]
    print(swfurl)
    # NOTE(review): the original ran two consecutive replace() calls that
    # both appear to remove a plain space; one of them may once have
    # targeted a different whitespace character -- confirm.
    des = pieces[-1].replace(' ', '')
    record = "%s %s %s %s %s %s" % (path, title, swfurl, img, des, size)
    # Keep each record on a single line.
    record = record.replace('\n', '').replace('\r', '')
    result_4399 += record + '\n'
    return None
###==================================================================
def save_result_4399():
    """Write the accumulated ``result_4399`` text to '4399.dat' and clear it.

    The file is opened in "w" mode, so any earlier content is replaced.
    """
    global result_4399
    # `with` closes the file even if write() raises.
    with open('4399.dat', "w") as f:
        f.write(result_4399)
    result_4399 = ''
###==================================================================
def _queue_index_page_4399(wm, indexurl, path):
    """Fetch one listing page and queue a download job per game found on it.

    Returns the fed classLister_4399 parser, or None when the page could
    not be fetched.
    """
    htmlContent = get_url_data(indexurl)
    if htmlContent is None:
        return None
    parser = classLister_4399()
    parser.feed(htmlContent.replace('<br/>', ''))
    parser.close()
    # zip() stops at the shortest list, so a page where fewer titles or
    # images than links were collected cannot raise IndexError.
    for url, title, img in zip(parser.urls, parser.titles, parser.imgs):
        wm.add_job(download_swf_4399, url, path, title, img, None, 'Unknown')
    return parser


def parse_4399():
    """Crawl the 4399 listing pages and save one record per game.

    If '4399_url.dat' exists, the listing-page URLs are read from it (one
    per line).  Otherwise categories 1-16 except 10 and 15 are walked page
    by page, and each visited URL is appended to '4399_url.dat'.  Every game
    link found is queued on a WorkerManager; once all jobs have completed
    the collected records are written by save_result_4399().
    """
    print("parse_4399() ")
    socket.setdefaulttimeout(20)
    wm = WorkerManager(200)
    if os.path.exists('4399_url.dat'):
        with open('4399_url.dat', 'r') as rFile:
            lines = rFile.readlines()
        # NOTE(review): `loop` advances once per cached URL line, while the
        # crawl branch below writes one line per *page*; the label produced
        # here may therefore differ from the category number -- confirm.
        loop = 0
        for indexurl in lines:
            if len(indexurl) < 4:
                continue
            loop += 1
            if loop == 10 or loop == 15:
                loop += 1
            indexurl = indexurl.replace('\n', '')
            print(indexurl)
            _queue_index_page_4399(wm, indexurl, '%d' % (loop))
    else:
        for loop in range(1, 17):
            if loop == 10 or loop == 15:
                continue
            path = "%d" % (loop)
            # Start at page 1: the original counter started at 0 while the
            # first URL was already '_1', so page 1 was fetched twice.
            page = 1
            maxpage = 0
            while True:
                indexurl = 'http://www.4399.com/flash_fl/%d_%d.htm' % (loop, page)
                print(indexurl)
                with open('4399_url.dat', "a") as f:
                    f.write(indexurl + '\n')
                parser = _queue_index_page_4399(wm, indexurl, path)
                if parser is not None and maxpage == 0:
                    maxpage = parser.pages
                # A failed fetch no longer retries the same URL forever:
                # with the page count still unknown (0) the category is
                # abandoned, otherwise the crawl moves on to the next page.
                if page >= maxpage:
                    break
                page += 1
    wm.start()
    wm.wait_for_complete()
    save_result_4399()
def get_swfurl_gamecomcn(pageurl):
    """Fetch a page and return the text left after stripping head and tail.

    Everything from '<!DOCTYPE' through the last match of 'fullplay.html?'
    is dropped, then everything from the first "'," through the last
    '</html>' is dropped.  The remainder is returned as a string.

    Raises IOError if the page cannot be fetched.
    """
    htmlContent = get_url_data(pageurl)
    if htmlContent is None:
        # Previously this surfaced as a TypeError inside the regex call.
        raise IOError("could not fetch %s" % (pageurl))
    # NOTE(review): the `*` quantifiers and the ASCII apostrophe are
    # restored on the assumption that they were lost when the script was
    # pasted -- confirm.  Also note that 'html?' is an unescaped regex: it
    # makes the final 'l' optional rather than matching a literal '?'.
    head = re.compile(r"<!DOCTYPE.*fullplay.html?", re.DOTALL)
    tail = re.compile(r"',.*</html>", re.DOTALL)
    return tail.sub('', head.sub('', htmlContent))
###==================================================================
def get_allwebgame():
    """Run the enabled site crawlers; at present only the 4399 one."""
    # Crawlers listed here are currently switched off:
    #   parse_xiaoyouxi(), parse_7k7k(), parse_yx007(),
    #   parse_gamecomcn(), parse_3839()
    parse_4399()
#http://www.yx8.com/
# Run the crawl only when the file is executed as a script, not on import.
if __name__ == "__main__":
    get_allwebgame()
山寨游戏网站之调研 [draft]
山寨游戏网站之调研
...