PubMed 单篇文献基本信息获取

https://pubmed.ncbi.nlm.nih.gov/33883728/

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import requests
from lxml import etree

url = "https://pubmed.ncbi.nlm.nih.gov/33883728/"

# Fetch the article page; a timeout prevents the script hanging on a stalled connection.
r = requests.get(url, timeout=30).text
html = etree.HTML(r)

# Article title: the single <h1> inside the heading section.
title = html.xpath('//*[@id="full-view-heading"]/h1/text()')[0].strip()
print(title)

# Author names, joined into one comma-separated string.
authors = html.xpath('//*[@id="full-view-heading"]/div[2]/div/div/span/a/text()')
authors = ','.join(authors)
print(authors)

# PubMed ID shown in the identifier list.
pmID = html.xpath('//*[@id="full-view-identifiers"]/li[1]/span/strong/text()')[0]
print(pmID)

# Journal short name from the journal trigger button.
mag = html.xpath('//*[@id="full-view-journal-trigger"]/text()')[0].strip()
print(mag)

# Citation string, e.g. "2021 Aug;xx(x):xxx-xxx": the year is the first
# 4 characters; the part after ';' (if any) is the volume/issue/pages.
info_parts = html.xpath('//*[@id="full-view-heading"]/div[1]/div[2]/span[2]/text()')[0].split(';')
year = info_parts[0][:4]
info = info_parts[1] if len(info_parts) > 1 else ""
print(info)
print(year)

# Some records have no English abstract; skip instead of crashing.
try:
    abstract = html.xpath('//*[@id="eng-abstract"]/p/text()')[0].strip()
    print(abstract)
except IndexError:
    pass

# Keywords are optional too; only catch the "element/index missing" case
# rather than a bare except that would hide real bugs.
try:
    kw = html.xpath('/html/body/div[5]/main/div[2]/p/text()')[1].strip()
    print(kw)
except IndexError:
    pass

PubMed 多篇文献基本信息获取

文章对应链接的获取

在搜索页中,默认为十篇,先爬取一篇文章的链接

复制 Xpath ://*[@id="search-results"]/section/div[2]/div/article[1]/div[2]/div[1]/a

根据验证的跳转文章链接为:[https://pubmed.ncbi.nlm.nih.gov/33883728/](https://pubmed.ncbi.nlm.nih.gov/33883728/),可以确定,Xpath 中的href属性是我们需要的,

那么获取链接的代码:

python
1
# Grab the @href attribute of the first article's title link; xpath() returns a list of strings.
link = html.xpath('//*[@id="search-results"]/section/div[2]/div/article[1]/div[2]/div[1]/a/@href')

接下来考虑如何利用循环,把这一页的 10 篇文章链接全部获取到。

1、2、3 篇文章对应的 Xpath:

python
1
2
3
//*[@id="search-results"]/section/div[2]/div/article[1]/div[2]/div[1]/a
//*[@id="search-results"]/section/div[2]/div/article[2]/div[2]/div[1]/a
//*[@id="search-results"]/section/div[2]/div/article[3]/div[2]/div[1]/a

可以发现,三条 Xpath 中唯一发生变化的是 article[] 方括号内的序号,因此可以用循环变量替换它。

python
1
2
3
4
# Loop over the 10 articles on one results page; only the article[...] index changes.
for num in range(1,11):
    link = html.xpath(f'//*[@id="search-results"]/section/div[2]/div/article[{num}]/div[2]/div[1]/a/@href')
    # xpath() returns a list of relative hrefs; take the first and prepend the site root.
    link = 'https://pubmed.ncbi.nlm.nih.gov' + link[0]
    print(link)

完整代码:

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import requests
from lxml import etree

#url = "https://pubmed.ncbi.nlm.nih.gov/33883728/"

def getSigleParperInfo(url):
    """Fetch one PubMed article page and print its basic metadata.

    Prints title, authors, PubMed ID, journal, citation info/year,
    and (when present) the English abstract and keywords.
    """
    r = requests.get(url, timeout=30).text
    html = etree.HTML(r)

    title = html.xpath('//*[@id="full-view-heading"]/h1/text()')[0].strip()
    print(title)

    # Author names joined into one comma-separated string.
    authors = html.xpath('//*[@id="full-view-heading"]/div[2]/div/div/span/a/text()')
    authors = ','.join(authors)
    print(authors)

    pmID = html.xpath('//*[@id="full-view-identifiers"]/li[1]/span/strong/text()')[0]
    print(pmID)

    mag = html.xpath('//*[@id="full-view-journal-trigger"]/text()')[0].strip()
    print(mag)

    # Citation string, e.g. "2021 Aug;xx(x):xxx-xxx". Guard against pages
    # where the node is missing or the string has no ';' separator.
    info_list = html.xpath('//*[@id="full-view-heading"]/div[1]/div[2]/span[2]/text()')
    if info_list:
        info_parts = info_list[0].strip().split(';')
        year = info_parts[0][:4]
        info = info_parts[1] if len(info_parts) > 1 else ""
        print(info)
        print(year)

    # Abstract and keywords are optional; catch only the "missing element"
    # case instead of a bare except that would also hide real bugs.
    try:
        abstract = html.xpath('//*[@id="eng-abstract"]/p/text()')[0].strip()
        print(abstract)
    except IndexError:
        pass

    try:
        kw = html.xpath('/html/body/div[5]/main/div[2]/p/text()')[1].strip()
        print(kw)
    except IndexError:
        pass

def getLinks(url):
    """Walk the (up to) 10 article links on one search-results page."""
    r = requests.get(url, timeout=30).text
    html = etree.HTML(r)
    for num in range(1,11):
        link = html.xpath(f'//*[@id="search-results"]/section/div[2]/div/article[{num}]/div[2]/div[1]/a/@href')
        if not link:
            # Fewer than 10 results on this page — skip the empty slot.
            continue
        link = 'https://pubmed.ncbi.nlm.nih.gov' + link[0]
        print(link)
        getSigleParperInfo(link)
        print('\n\n==================\n\n')

url  = "https://pubmed.ncbi.nlm.nih.gov/?term=Atherosclerosis"
getLinks(url)

# Listing-page links for subsequent pages:
#"https://pubmed.ncbi.nlm.nih.gov/?term=Atherosclerosis&page=2"
#"https://pubmed.ncbi.nlm.nih.gov/?term=Atherosclerosis&page=3"

针对遇到的一些问题,进行了修正:

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import requests
from lxml import etree

#url = "https://pubmed.ncbi.nlm.nih.gov/33883728/"

def getSigleParperInfo(url):
    """Fetch one PubMed article page and print its basic metadata.

    Prints title, authors, PubMed ID, journal, citation info/year,
    and (when present) the English abstract and keywords.
    """
    r = requests.get(url, timeout=30).text
    html = etree.HTML(r)

    title = html.xpath('//*[@id="full-view-heading"]/h1/text()')[0].strip()
    print(title)

    # Author names joined into one comma-separated string.
    authors = html.xpath('//*[@id="full-view-heading"]/div[2]/div/div/span/a/text()')
    authors = ','.join(authors)
    print(authors)

    pmID = html.xpath('//*[@id="full-view-identifiers"]/li[1]/span/strong/text()')[0]
    print(pmID)

    mag = html.xpath('//*[@id="full-view-journal-trigger"]/text()')[0].strip()
    print(mag)

    # Citation string, e.g. "2021 Aug;xx(x):xxx-xxx". Check the XPath
    # returned a node before indexing, and tolerate a missing ';'.
    info_list = html.xpath('//*[@id="full-view-heading"]/div[1]/div[2]/span[2]/text()')
    if info_list:
        info_text = info_list[0].strip()
        info_parts = info_text.split(';')
        year = info_parts[0][:4] if info_parts else ""
        info = info_parts[1] if len(info_parts) > 1 else ""
        print(info)
        print(year)

    # Abstract and keywords are optional; catch only the "missing element"
    # case instead of a bare except that would also hide real bugs.
    try:
        abstract = html.xpath('//*[@id="eng-abstract"]/p/text()')[0].strip()
        print(abstract)
    except IndexError:
        pass

    try:
        kw = html.xpath('/html/body/div[5]/main/div[2]/p/text()')[1].strip()
        print(kw)
    except IndexError:
        pass

def getLinks(url):
    """Walk the (up to) 10 article links on one search-results page."""
    r = requests.get(url, timeout=30).text
    html = etree.HTML(r)
    for num in range(1,11):
        link = html.xpath(f'//*[@id="search-results"]/section/div[2]/div/article[{num}]/div[2]/div[1]/a/@href')
        if not link:
            # Fewer than 10 results on this page — skip the empty slot.
            continue
        link = 'https://pubmed.ncbi.nlm.nih.gov' + link[0]
        print(link)
        getSigleParperInfo(link)
        print('\n\n==================\n\n')

url  = "https://pubmed.ncbi.nlm.nih.gov/?term=Atherosclerosis"
getLinks(url)

# Listing-page links for subsequent pages:
#"https://pubmed.ncbi.nlm.nih.gov/?term=Atherosclerosis&page=2"
#"https://pubmed.ncbi.nlm.nih.gov/?term=Atherosclerosis&page=3"

翻页获取多篇文献信息

记录链接:

python
1
2
3
https://pubmed.ncbi.nlm.nih.gov/?term=Atherosclerosis
https://pubmed.ncbi.nlm.nih.gov/?term=Atherosclerosis&page=2
https://pubmed.ncbi.nlm.nih.gov/?term=Atherosclerosis&page=3

并且[https://pubmed.ncbi.nlm.nih.gov/?term=Atherosclerosis&page=1](https://pubmed.ncbi.nlm.nih.gov/?term=Atherosclerosis&page=1)可以打开第一页

python
1
2
3
4

for pageNum in range(1, 11):
    url = f"https://pubmed.ncbi.nlm.nih.gov/?term=Atherosclerosis&page={pageNum}"
    getLinks(url)

完整代码

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import requests
from lxml import etree

#url = "https://pubmed.ncbi.nlm.nih.gov/33883728/"

def getSigleParperInfo(url):
    """Fetch one PubMed article page; print the title, extract the rest silently.

    Only the title is printed in this variant; the other fields are
    extracted to validate the XPaths but not emitted.
    """
    r = requests.get(url, timeout=30).text
    html = etree.HTML(r)

    title = html.xpath('//*[@id="full-view-heading"]/h1/text()')[0].strip()
    print(title)

    # Author names joined into one comma-separated string.
    authors = html.xpath('//*[@id="full-view-heading"]/div[2]/div/div/span/a/text()')
    authors = ','.join(authors)

    pmID = html.xpath('//*[@id="full-view-identifiers"]/li[1]/span/strong/text()')[0]

    mag = html.xpath('//*[@id="full-view-journal-trigger"]/text()')[0].strip()

    # Citation string, e.g. "2021 Aug;xx(x):xxx-xxx". Check the XPath
    # returned a node before indexing, and tolerate a missing ';'.
    info_list = html.xpath('//*[@id="full-view-heading"]/div[1]/div[2]/span[2]/text()')
    if info_list:
        info_text = info_list[0].strip()
        info_parts = info_text.split(';')
        year = info_parts[0][:4] if info_parts else ""
        info = info_parts[1] if len(info_parts) > 1 else ""

    # Abstract and keywords are optional; catch only the "missing element"
    # case instead of a bare except that would also hide real bugs.
    try:
        abstract = html.xpath('//*[@id="eng-abstract"]/p/text()')[0].strip()
    except IndexError:
        pass

    try:
        kw = html.xpath('/html/body/div[5]/main/div[2]/p/text()')[1].strip()
    except IndexError:
        pass

def getLinks(url):
    """Walk the (up to) 10 article links on one search-results page."""
    r = requests.get(url, timeout=30).text
    html = etree.HTML(r)
    for num in range(1,11):
        link = html.xpath(f'//*[@id="search-results"]/section/div[2]/div/article[{num}]/div[2]/div[1]/a/@href')
        if not link:
            # Fewer than 10 results on this page — skip the empty slot.
            continue
        link = 'https://pubmed.ncbi.nlm.nih.gov' + link[0]
        print(link)
        getSigleParperInfo(link)
        print('\n\n==================\n\n')


# Crawl result pages 1..10.
for pageNum in range(1, 11):
    url = f"https://pubmed.ncbi.nlm.nih.gov/?term=Atherosclerosis&page={pageNum}"
    getLinks(url)

使用 pandas 处理数据

https://pandas.pydata.org/

python
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import requests
from lxml import etree
import pandas as pd

# One row per paper; every row must have exactly len(columns) elements,
# otherwise pd.DataFrame(papersInfo, columns=columns) raises on ragged rows.
columns = ['Title', 'Authors', 'pmID','Mag', 'Info', 'Year', 'Abstract', 'Keywords']
papersInfo = []

def getSigleParperInfo(url):
    """Fetch one PubMed article page and append its fields as a fixed-width row."""
    siglePaperInfo = []
    r = requests.get(url, timeout=30).text
    html = etree.HTML(r)

    title = html.xpath('//*[@id="full-view-heading"]/h1/text()')[0].strip()
    siglePaperInfo.append(title)

    # Author names joined into one comma-separated string.
    authors = html.xpath('//*[@id="full-view-heading"]/div[2]/div/div/span/a/text()')
    authors = ','.join(authors)
    siglePaperInfo.append(authors)

    pmID = html.xpath('//*[@id="full-view-identifiers"]/li[1]/span/strong/text()')[0]
    siglePaperInfo.append(pmID)

    mag = html.xpath('//*[@id="full-view-journal-trigger"]/text()')[0].strip()
    siglePaperInfo.append(mag)

    # Citation string, e.g. "2021 Aug;xx(x):xxx-xxx".
    info_list = html.xpath('//*[@id="full-view-heading"]/div[1]/div[2]/span[2]/text()')
    if info_list:
        info_text = info_list[0].strip()
        info_parts = info_text.split(';')
        year = info_parts[0][:4] if info_parts else ""
        info = info_parts[1] if len(info_parts) > 1 else ""
    else:
        # BUG FIX: previously Info/Year were simply skipped, producing a
        # 6-element row that breaks the DataFrame constructor below.
        info = ""
        year = ""
    siglePaperInfo.append(info)
    siglePaperInfo.append(year)

    # Abstract and keywords are optional; default to empty strings so the
    # row width stays constant.
    abstract = ""
    kw = ""

    try:
        abstract = html.xpath('//*[@id="eng-abstract"]/p/text()')[0].strip()
    except IndexError:
        pass

    try:
        kw = html.xpath('/html/body/div[5]/main/div[2]/p/text()')[1].strip()
    except IndexError:
        pass

    siglePaperInfo.append(abstract)
    siglePaperInfo.append(kw)

    print(f"文章元素数量: {len(siglePaperInfo)}")

    print(siglePaperInfo)
    papersInfo.append(siglePaperInfo)


def getLinks(url):
    """Walk the (up to) 10 article links on one search-results page."""
    r = requests.get(url, timeout=30).text
    html = etree.HTML(r)
    for num in range(1,11):
        link = html.xpath(f'//*[@id="search-results"]/section/div[2]/div/article[{num}]/div[2]/div[1]/a/@href')
        if not link:
            # Fewer than 10 results on this page — skip the empty slot.
            continue
        link = 'https://pubmed.ncbi.nlm.nih.gov' + link[0]
        print(link)
        getSigleParperInfo(link)
        print('\n\n==================\n\n')


# Crawl the first two result pages, then export everything to Excel.
for pageNum in range(1, 3):
    url = f"https://pubmed.ncbi.nlm.nih.gov/?term=Atherosclerosis&page={pageNum}"
    getLinks(url)

df1 = pd.DataFrame(papersInfo, columns = columns)
df1.to_excel("PubMed文献信息.xlsx", index = False)