Cyrus Blog

FLAG{S0_H4PPY_C_U_H3R3} (>.<)

爬虫入门 0x00 单页面抓取文件

本文共 339 字,预计阅读时间 1 分钟。

抓取一个页面上指定类型的文件并保存。
最后一版Python2.7写的代码,之后就开始用3.6了。
最后一版不会用re写的代码,之后……之后大概也没怎么学吧。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54

# Author = Cyrus
# Py Edition = 2.7
import requests
import os

# Clear the console before any progress output is printed.
# NOTE(review): "cls" is Windows-only; on POSIX shells this command fails
# harmlessly — confirm the target platform.
os.system("cls")

# Download counters, incremented by download() below.
# (The original also had `global success` / `global error` here; a `global`
# statement at module level is a no-op in Python, so those lines were dropped
# — the names and initial values are unchanged.)
success = 0
error = 0

def download(url,filename):
global success
global error
s=requests.get(url,stream=True)
if (s.status_code==200):
with open(filename, &#39;wb&#39;) as f:
for chunk in s:
f.write(chunk)
print &#39; - Sucess&#39;
success+=1
else:
print &#39; - Error&#39;
error+=1

def find(url,a,path):
print &quot;******************************************&quot;
print &quot;URL = &quot;,url
print &quot;FILETYPE = &quot;,a
s=requests.post(url=url).content
while (s.partition(a)[1]!=&quot;&quot;):
r1=url+s.partition(a)[0]
while (r1.partition(&quot;\&quot;&quot;)[1]!=&quot;&quot;):
r1=r1.partition(&quot;\&quot;&quot;)[2]
filename=r1
while (filename.partition(&quot;/&quot;)[1]!=&quot;&quot;):
filename=filename.partition(&quot;/&quot;)[2]
filename=path+filename+a
if (r1.partition(&quot;http://&quot;)[1]!=&quot;&quot;):
r1=r1+a
else:
r1=url+r1+a
print r1,
download(r1,filename)
s=s.partition(a)[2]
print &quot;&quot;

url=&quot;http://www.uestc.edu.cn/&quot;
find(url,&quot;.jpg&quot;,&quot;&quot;)
find(url,&quot;.png&quot;,&quot;&quot;)
find(url,&quot;.html&quot;,&quot;&quot;)

print &quot;******************************************&quot;
print &quot;Success download &quot;,success,&quot; file(s),&quot;
print &quot;Failed download &quot;,error,&quot; file(s).&quot;

【运行】