国内最大的小游戏网站是哪个?如果要山寨的话,做这个要先调研,这回只山寨一个就好,节约时间。
4399.com  Alexa网站流量排名: 289  CN的流量排名: 62 反向链接:2,053
xiaoyouxi.com Alexa网站流量排名: 43,110  CN的流量排名: 8,149 反向链接:1,999
7k7k.com Alexa网站流量排名: 712  CN的流量排名: 121 反向链接:1,259

搜了几下,搜到一个排名,中国在线Flash第一的小游戏网站觉得非4399莫属了,今天就山寨4399吧。今天下载它的所有flash试试。下面是两年前写的代码,现在看起来写得很幼稚,那时正则表达式还不太熟练,还在用SGMLParser这样的东西,代码里还有大量的+号,也不够pythonic,思维到现在还是过程式编程。总之,下面贴出来的不能用了,网站url都变了,今天又要重写了。

# -*- coding: utf-8 -*-
#coding = utf-8
# Filename: getflash.py
#author = yobin
#2008.3


import re,os,sys
import urllib,socket
from sgmllib import SGMLParser
from threadpool import WorkerManager

# Record fields: (image) name, category, trial play, (link) size, speed,
# description, (source)

# Per-site text buffers.  result_4399 is appended to by download_swf_4399()
# and flushed by save_result_4399(); the remaining buffers are not used by the
# code visible in this file (presumably reserved for the site parsers that are
# commented out in get_allwebgame()).
result_4399 = ''
result_xiaoyouxi = ''
result_7k7k = ''
result_yx007 = ''
result_gamecomcn = ''
result_3839 = ''

# NOTE(review): this rebinds the builtin name ``type`` at module scope.  The
# name is kept so any external reader of ``type`` keeps working; nothing in
# the visible code reads it.
type = sys.getfilesystemencoding()

###==================================================================
def get_url_data(url):
    """Fetch *url* and return the response body, retrying on failure.

    Up to five attempts are made with ``urllib.urlopen``.

    Returns:
        The body exactly as returned by ``read()``, or ``None`` when all five
        attempts fail (a "get url fail" line is printed in that case).
    """
    nFail = 0
    while nFail < 5:
        try:
            sock = urllib.urlopen(url)
            try:
                return sock.read()
            finally:
                # Release the connection even when read() raises.
                sock.close()
        except Exception:
            # Best-effort retry.  Unlike a bare ``except:``, this still lets
            # KeyboardInterrupt / SystemExit abort the crawl.
            nFail += 1
    print("get url fail:%s" % (url,))
    return None


class classLister_4399(SGMLParser):
    """Collect game links, thumbnails and titles from a 4399.com listing page.

    The parser is a small state machine driven by ``self.bScope``:

    * 0 -- idle; waiting for ``<span class="style04">``.
    * 1 -- a ``style04`` span was seen; waiting for an ``<img>``.
    * 2 -- a thumbnail was recorded; the next ``<a class="color6">`` supplies
      the game URL and the next text node supplies the title.
    * 3 -- the ``/images/xy.gif`` image was seen; the next ``<a href>``
      supplies the page number stored in ``self.pages``.

    After ``feed()``, results are in ``urls``, ``titles``, ``imgs`` (parallel
    lists, not guaranteed to have equal length) and ``pages``.
    """

    def reset(self):
        """Reset parser state and clear every collected result."""
        SGMLParser.reset(self)
        self.bScope = 0
        self.urls = []
        self.titles = []
        self.imgs = []
        self.pages = 0
        self.total = 0  # not updated anywhere in this class

    def start_span(self, attrs):
        """Enter state 1 on ``<span class="style04">`` when idle."""
        href = [v for k, v in attrs if k == 'class']
        if href:
            if href[0] == "style04" and self.bScope == 0:
                self.bScope = 1

    def start_img(self, attrs):
        """In state 1, record a thumbnail or detect the ``xy.gif`` marker."""
        if self.bScope == 1:
            imgsrc = [v for k, v in attrs if k == 'src']
            if imgsrc:
                if imgsrc[0] == '/images/xy.gif':
                    # Marker image: the following link carries the page number.
                    self.bScope = 3
                else:
                    self.imgs.extend(imgsrc)
                    self.bScope = 2

    def start_a(self, attrs):
        """Record a game URL (state 2) or the page number (state 3)."""
        if self.bScope == 2:
            myclass = [v for k, v in attrs if k == 'class']
            if myclass:
                if myclass[0] == 'color6':
                    href = [v for k, v in attrs if k == 'href']
                    if href:
                        self.urls.extend(href)
        if self.bScope == 3:
            href = [v for k, v in attrs if k == 'href']
            if href:
                self.bScope = 0
                # The href is expected to end in "_<number>.htm".
                text = href[0].replace('.htm', '')
                try:
                    self.pages = int(text.split('_')[-1])
                except ValueError:
                    # Unexpected href shape: keep the previous page count
                    # instead of aborting the whole parse.
                    pass

    def handle_data(self, text):
        """In state 2, take the next text node as the title and return to state 1."""
        if self.bScope == 2:
            self.bScope = 1
            self.titles.append(text)

###==================================================================
def get_swfurl_4399(pageurl):
    """Fetch a 4399 game page and strip it down to its embedded data string.

    ``</script>`` and ``<br/>`` are removed, then everything from
    ``<!DOCTYPE`` through the last ``var strN = "`` and everything from the
    first ``<iframe`` through the last ``</HTML>`` is deleted.

    Returns:
        The remaining text, or ``''`` when the page could not be fetched.
    """
    htmlContent = get_url_data(pageurl)
    if htmlContent is None:
        # get_url_data() already reported the failure; avoid calling
        # .replace() on None.
        return ''
    htmlContent = htmlContent.replace('</script>', '').replace('<br/>', '')

    # Greedy with DOTALL: removes the page head up to the last `var strN = "`.
    head = re.compile(r'<!DOCTYPE.*var str\d+ = "', re.DOTALL)
    htmlContent = head.sub('', htmlContent)

    # Removes the page tail from the first <iframe onwards.
    tail = re.compile(r"<iframe.*</HTML>", re.DOTALL)
    return tail.sub('', htmlContent)


###==================================================================
def download_swf_4399(url, path, title=None, img=None, des=None, size='Unknown'):
    """Append one record for a 4399 game to the global ``result_4399`` buffer.

    Despite the name, no .swf file is downloaded: the game page is fetched
    through get_swfurl_4399(), the swf URL and description are derived from
    it, and a single text line is appended to ``result_4399``.

    Args:
        url: site-relative game page URL (appended to ``http://www.4399.com``).
        path: category label written as the first field of the record.
        title: game title.
        img: thumbnail URL.
        des: ignored; the description is always taken from the fetched page.
        size: size label written as the last field of the record.

    Returns:
        None.
    """
    global result_4399
    mystring = get_swfurl_4399('http://www.4399.com' + url)
    if not mystring:
        # Nothing was extracted from the page; skip it rather than record an
        # entry whose swf URL is just the bare prefix.
        return None

    myset = mystring.split('";')
    swfurl = 'http://www.gg173.net:8080/4399swf' + myset[0]
    print(swfurl)

    des = myset[-1].replace(' ', '').replace('&nbsp;', '')

    temp = "%s %s %s %s %s %s" % (path, title, swfurl, img, des, size)
    # Keep each record on a single line.
    temp = temp.replace('\n', '').replace('\r', '')
    # NOTE(review): parse_4399() submits this function to a WorkerManager
    # pool.  If those jobs run on concurrent threads, this read-modify-write
    # of a global is not atomic -- confirm and guard with a lock if so.
    result_4399 += temp + '\n'
    return None


###==================================================================
def save_result_4399():
    """Write the accumulated 4399 records to ``4399.dat`` and clear the buffer.

    The file is opened in "w" mode, so any previous content is overwritten.
    """
    global result_4399
    with open('4399.dat', "w") as f:
        f.write(result_4399)
    result_4399 = ''

###==================================================================
def _queue_listing_jobs_4399(wm, indexurl, path):
    """Fetch one listing page and queue a job per game; return its page number field, or None if the fetch failed."""
    htmlContent = get_url_data(indexurl)
    if htmlContent is None:
        return None

    parser = classLister_4399()
    parser.feed(htmlContent.replace('<br/>', ''))
    parser.close()

    # zip() stops at the shortest list, so a page on which the parser
    # collected fewer titles or images than links cannot raise IndexError.
    for url, title, img in zip(parser.urls, parser.titles, parser.imgs):
        wm.add_job(download_swf_4399, url, path, title, img, None, 'Unknown')
    return parser.pages


def parse_4399():
    """Crawl the 4399.com listing pages and write the results to ``4399.dat``.

    If ``4399_url.dat`` exists, the listing URLs stored in it are replayed.
    Otherwise categories 1..16 (skipping 10 and 15) are crawled page by page
    and every visited listing URL is appended to ``4399_url.dat``.  In both
    cases the queued jobs are then run and save_result_4399() is called.
    """
    print("parse_4399() ")

    socket.setdefaulttimeout(20)
    wm = WorkerManager(200)

    if os.path.exists('4399_url.dat'):
        with open('4399_url.dat', 'r') as rFile:
            lines = rFile.readlines()

        # NOTE(review): the label below advances once per stored line, while
        # the crawl branch stores one line per *page*; a replay of a crawled
        # file therefore labels pages, not categories.  Behaviour kept as-is;
        # confirm what was intended.
        loop = 0
        for indexurl in lines:
            if len(indexurl) < 4:
                continue  # blank or truncated line
            loop += 1
            if loop == 10 or loop == 15:
                loop += 1
            path = '%d' % (loop)

            indexurl = indexurl.replace('\n', '')
            print(indexurl)
            _queue_listing_jobs_4399(wm, indexurl, path)
    else:
        for loop in range(1, 17):
            if loop == 10 or loop == 15:
                continue

            path = "%d" % (loop)
            # Start at page 1: starting at 0 made the loop request
            # "<category>_1.htm" twice and queue every game on it twice.
            page = 1
            maxpage = 0

            while True:
                indexurl = 'http://www.4399.com/flash_fl/%d_%d.htm' % (loop, page)
                print(indexurl)
                with open('4399_url.dat', "a") as f:
                    f.write(indexurl + '\n')

                pages = _queue_listing_jobs_4399(wm, indexurl, path)
                # A failed fetch (pages is None) is skipped instead of being
                # retried forever; get_url_data() has already retried it.
                if pages is not None and maxpage == 0:
                    maxpage = pages

                if page >= maxpage:
                    break
                page += 1

    wm.start()
    wm.wait_for_complete()
    save_result_4399()

def get_swfurl_gamecomcn(pageurl):
    """Fetch a page and strip it down to the text after ``fullplay.htm(l)``.

    Everything from ``<!DOCTYPE`` through the last ``fullplay.htm``/
    ``fullplay.html`` is deleted, then everything from the first ``',``
    through the last ``</html>`` is deleted.

    Returns:
        The remaining text, or ``''`` when the page could not be fetched.
    """
    htmlContent = get_url_data(pageurl)
    if htmlContent is None:
        # get_url_data() already reported the failure; avoid passing None to
        # the regex engine.
        return ''

    # NOTE(review): the trailing "?" is a regex quantifier (optional "l"),
    # not a literal question mark, and "." matches any character.  If the
    # intent was to strip a literal "fullplay.html?" this should be
    # r"fullplay\.html\?" -- left unchanged pending confirmation.
    head = re.compile(r"<!DOCTYPE.*fullplay.html?", re.DOTALL)
    htmlContent = head.sub('', htmlContent)

    tail = re.compile(r"',.*</html>", re.DOTALL)
    return tail.sub('', htmlContent)

###==================================================================
def get_allwebgame():
    """Run the site crawlers; only the 4399 crawler is currently enabled."""
    parse_4399()
    # The calls below are disabled; no such functions are defined in the
    # visible part of this file.
    #parse_xiaoyouxi()
    #parse_7k7k()
    #parse_yx007()
    #parse_gamecomcn()
    #parse_3839()
    #http://www.yx8.com/

# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    get_allwebgame()