-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy pathcraw_dblp.py
79 lines (69 loc) · 3.14 KB
/
craw_dblp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import urllib.request
import re
import os
import xlwt
def get_pdf_arxiv(web_site, path):
rep = urllib.request.urlopen(urllib.request.Request(web_site))
page = rep.read().decode('utf-8')
pdf_download = re.findall('<meta name="citation_pdf_url" content="(.*?)"/>', page, re.S) # 查询到网页中对应的pdf下载链接
print(pdf_download[0])
if (len(pdf_download) != 0):
try:
u = urllib.request.urlopen(pdf_download[0])
except urllib.error.HTTPError:
print(pdf_download[0], "url file not found")
return
block_sz = 8192
with open(path, 'wb') as f:
while True:
buffer = u.read(block_sz)
if buffer:
f.write(buffer)
else:
break
print("Sucessful to download " + path)
# 创建一个Excel文件
file = xlwt.Workbook()
# 创建sheet工作表
sheet1 = file.add_sheet(u'表1', cell_overwrite_ok=True)
for j in range(10):
req = urllib.request.Request(
'https://dblp.uni-trier.de/search//publ/inc?q=Community%20Detection&s=ydvspc&h=30&b=' + str(
j)) # 此处只需要修改q=后面的内容,并且保留&s=ydvspc之后的内容
response = urllib.request.urlopen(req)
the_page = response.read().decode('utf-8')
paper_title = re.findall('<span class="title" itemprop="name">(.*?)</span>', the_page, re.S) # 检索页面中给的论文名字
paper_pub = re.findall('<span itemprop="name">(.*?)</span>', the_page, re.S) # 检索出版商
paper_year = re.findall('<span itemprop="datePublished">(.*?)</span>', the_page, re.S) # 检索出版年份
# paper_web = re.findall('view</b></p><ul><li class="ee"><a href="(.*?)" itemprop="url">', the_page, re.S) # 存储文章对应网址
for i in range(len(paper_title)):
# 以下代码去除了原有网页论文名字中无法保存为window文件名的符号,采用下划线_代替
paper_title[i] = paper_title[i].replace('"', '')
list = paper_title[i].split(" ")
paper_title[i] = "_".join(list)
list = paper_title[i].split(":")
paper_title[i] = "_".join(list)
list = paper_title[i].split("?")
paper_title[i] = "_".join(list)
list = paper_title[i].split("<sub>")
paper_title[i] = "_".join(list)
list = paper_title[i].split("</sub>")
paper_title[i] = "_".join(list)
list = paper_title[i].split("<sup>")
paper_title[i] = "_".join(list)
list = paper_title[i].split("</sup>")
paper_title[i] = "_".join(list)
print(paper_title[i])
# path_dir = "D:\\APTX-4869\\md\\机器学习\\社区网络\\论文\\"
# 写入数据从 0 开始计数
sheet1.write(i + j*30, 0, paper_title[i])
if i < len(paper_pub):
sheet1.write(i + j*30, 1, paper_pub[i])
if i < len(paper_year):
sheet1.write(i + j*30, 2, paper_year[i])
# dir_list = os.listdir(path_dir)
# path = paper_title[i] + "pdf"
# if path not in dir_list:
# get_pdf_arxiv(paper_web[i], path_dir + path)
# 保存文件
file.save(f"论文.xls")