from bs4 import BeautifulSoup  # beautifulsoup4
from lxml import html          # lxml, used for the XPath queries below
import requests
import pandas as pd
import numpy as np
import re
library(rvest)
library(dplyr)
Within Novartis, we need a proxy to access some websites.
url = "https://www.lexjansen.com"
proxyDict = {'https': xxx}  # xxx is a placeholder for the proxy address
# Fetch the page (requests is also commonly used to interact with REST APIs)
# requests.get() returns a Response object
r = requests.get(url, proxies=proxyDict, verify=False)
# for css selector
soup = BeautifulSoup(r.text, 'lxml')
# for XPath
tree = html.fromstring(r.content)
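Because verify=False disables certificate checking, requests emits an InsecureRequestWarning on every call; if that noise is unwanted it can be silenced (a minimal sketch, not part of the original workflow):
import urllib3
# suppress the warning triggered by verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)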
# read url
website <- "https://www.lexjansen.com"
lex <- read_html(website)
Different parsers: BeautifulSoup can use several parser back-ends (e.g. 'html.parser', 'lxml', 'html5lib'); 'lxml' is used here, while lxml's html.fromstring builds the tree for XPath.
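As a sketch, the same response can be parsed with any of them (the last one assumes html5lib is installed):
# same document, different parser back-ends
soup_std  = BeautifulSoup(r.text, 'html.parser')  # Python's built-in parser
soup_lxml = BeautifulSoup(r.text, 'lxml')         # fast and lenient (used in this paper)
soup_h5   = BeautifulSoup(r.text, 'html5lib')     # browser-like parsing, slowest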
| Paper information | |
|---|---|
| Title | A Case Study of Mining Social Media Data for Disaster Relief: Hurricane Irma |
| Link | https://www.sas.com/content/dam/SAS/support/en/sas-global-forum-proceedings/2018/2695-2018.pdf |
| Author | Bogdan Gadidov, Linh Le |
| Keyword | Text Mining Topic Modeling Time Series |
| Pages | 11 |
| Size | 660 Kb |

| Conference information | |
|---|---|
| Conference name | SAS Global Forum 2018 |
| Conference place | Denver, Colorado |
| Conference time | April 8-11, 2018 |

| Paper attributes | |
|---|---|
| Section name | Breakout |
| Best paper flag | |
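Put together, the three tables above describe the record we want for each paper. As an illustrative sketch (column names here are assumptions, values taken from the example paper above), the goal is one row like:
# one target row for the example paper
target_row = {
    'Title': 'A Case Study of Mining Social Media Data for Disaster Relief: Hurricane Irma',
    'Link': 'https://www.sas.com/content/dam/SAS/support/en/sas-global-forum-proceedings/2018/2695-2018.pdf',
    'Author': 'Bogdan Gadidov, Linh Le',
    'Keyword': 'Text Mining Topic Modeling Time Series',
    'Pages': '11',
    'Size': '660 Kb',
    'Conference_name': 'SAS Global Forum 2018',
    'Conference_place': 'Denver, Colorado',
    'Conference_time': 'April 8-11, 2018',
    'Section_name': 'Breakout',
    'Best_paper_fl': '',
}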
<a href="/sugi">SUGI / SAS Global Forum</a> papers (1976-2021)
# CSS selector
soup.select("div[id = 'sasproceedings'] > ul > li > div > div > span > a")
soup.select("div[id = 'sasproceedings'] a")
# XPath
tree.xpath('//div[@id ="sasproceedings"]/ul/li/div/div/span/a/text()')
tree.xpath('//div[@id ="sasproceedings"]/descendant::a/text()')
# Get SUGI URL
href = soup.select("div[id = 'sasproceedings'] a")[10].get('href')
sugi_url = url + href
#sublink1
uri1 <- lex %>% html_nodes("span a") %>% html_attr("href")
sublink1 <- paste0(website,uri1)
sublink1[6]
conf.ch <- read_html(sublink1[6])
[1] "https://www.lexjansen.com/sugi"
<a href="../cgi-bin/xsl_transform.php?x=sgf2018">SAS Global Forum 2018</a>
<span>April 8-11, 2018</span>
<span>Denver, Colorado</span>
r_sugi = requests.get(sugi_url, proxies=proxyDict, verify=False)
soup2 = BeautifulSoup(r_sugi.text, 'lxml')
tree2 = html.fromstring(r_sugi.content)
# SUGI Forum 2018 url
soup2.select('li a')[3].get('href')
# drop the leading ".." from the relative href before joining it to the site root
sugi_2018_url = url + soup2.select('li a')[3].get('href')[2:]
# 1. conference name
# CSS selector
soup2.select("li a")[3].text
# XPath
tree2.xpath('//li/a/text()')[3]
# 2. conference time
# CSS selector
soup2.select("li[class = 'conference']")[3].select("span")[0].text
# XPath: first <span> element under <li>
tree2.xpath('//li/span[1]/text()')[3]
# 3. conference place
# CSS selector
soup2.select("li[class = 'conference']")[3].select("span")[1].text
# XPath: second <span> element under <li>
tree2.xpath('//li/span[2]/text()')[3]
# SUGI Forum 2018 link
uri2 <- conf.ch %>% html_nodes("li a") %>% html_attr("href")
sublink2 <- paste0(website,substring(uri2,3))
conf.ch.2018 <- read_html(sublink2[4])
# conference name
conf.name <- conf.ch %>% html_nodes("li a") %>% html_text()
(conf.name.f <- conf.name[4])
# conference info (time, place)
conf.info <- conf.ch %>% html_nodes("li span") %>% html_text()
#time
conf.time <- conf.info[seq(1,length(conf.info),2)]
(conf.time.f <- conf.time[4])
#place
conf.place <- conf.info[seq(2,length(conf.info),2)]
(conf.place.f <- conf.place[4])
[1] "SAS Global Forum 2018"
[1] "April 8-11, 2018"
[1] "Denver, Colorado"
# create soup3 & tree3
r_sugi_2018 = requests.get(sugi_2018_url, proxies=proxyDict, verify=False)
soup3 = BeautifulSoup(r_sugi_2018.text, 'lxml')
tree3 = html.fromstring(r_sugi_2018.content)
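The fetch-and-parse step above is now repeated for the third time, so it can be wrapped in a small helper (a sketch; fetch_page is a name introduced here, not part of the original code):
def fetch_page(page_url):
    # fetch a page through the proxy and return both parse trees
    resp = requests.get(page_url, proxies=proxyDict, verify=False)
    return BeautifulSoup(resp.text, 'lxml'), html.fromstring(resp.content)

# e.g. soup3, tree3 = fetch_page(sugi_2018_url)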
Title: Under the <a target="_blank"> tag
<a target="_blank" href="https://www.sas.com/content/dam/SAS/support/en/sas-global-forum-proceedings/2018/2695-2018.pdf"
>A Case Study of Mining Social Media Data for Disaster Relief: Hurricane Irma</a>
# CSS selector
soup3.select('div.paper > a[target = "_blank"]')[2].text
# XPath: find the second <a> element that is a child of the <div> whose class contains "paper"
tree3.xpath('//div[contains(@class, "paper")]/child::a[2]/text()')[2]
# title
pname <- conf.ch.2018 %>% html_nodes(xpath="//*[@target='_blank']") %>% html_text()
pname.f <- pname[which(pname != "")]
(pa.name.f <- pname.f[3])
[1] "A Case Study of Mining Social Media Data for Disaster Relief: Hurricane Irma"
Link:
# CSS selector
soup3.select('div.paper > a[target = "_blank"]')[2].get('href')
# XPath:
tree3.xpath('//div[contains(@class, "paper")]/child::a[2]/@href')[2]
# paper link
filelink <- conf.ch.2018 %>% html_nodes(xpath="//*[@target='_blank']") %>% html_attr("href")
filelink.f <- filelink[which(filelink != "")]
(pa.link <- filelink.f[3])
[1] "https://www.sas.com/content/dam/SAS/support/en/sas-global-forum-proceedings/2018/2695-2018.pdf"
Author: Under the <a> tag
<a href="/cgi-bin/xsl_transform.php?x=ag&c=SUGI#bogdidov">Bogdan Gadidov</a>
<a href="/cgi-bin/xsl_transform.php?x=al&c=SUGI#linhnhle">Linh Le</a>
# beautifulsoup syntax
soup3.find_all("div", {"class": "paper"})[2].find_all('a', id=None, target=None)
# XPath: within the <div>, with attribute contains "paper", find <a> tag without attributes "id" and "target"
tree3.xpath('//div[contains(@class, "paper")][3]/a[not(@id) and not(@target)]/text()')
# author
author <- conf.ch.2018 %>% html_nodes(xpath="//div[contains(@class, 'paper')][3]/a[not(@id) and not(@target)]") %>% html_text()
(author.f <- paste0(author[seq(1,length(author),2)],", ",author[seq(2,length(author),2)]))
[1] "Bogdan Gadidov, Linh Le"
Keyword: Under the <span> tag
<span class="key"><b>Keywords:</b> Text Mining Topic Modeling Time Series </span>
# CSS selector
soup3.select('div.paper > span[class = "key"]')[0].text
# XPath: within the <div> whose class contains "paper", find the <span> tag with class="key"
tree3.xpath('//div[contains(@class, "paper")][3]/span[@class = "key"]/text()')
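The scraped text still carries the "Keywords:" label; in Python it can be stripped with the re module imported at the top (a sketch, mirroring the R sub() call below):
# drop the leading "Keywords:" label from the scraped text
key_text = soup3.select('div.paper > span[class = "key"]')[0].text
keyword = re.sub(r'^\s*Keywords:\s*', '', key_text).strip()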
# keyword
keyw <- conf.ch.2018 %>% html_nodes("div.paper span.key") %>% html_text()
keyword <- sub(".*: ", "",keyw)
(keyword.f <- keyword[1])
[1] "Text Mining Topic Modeling Time Series"
Page & Size: Under the <span> tag
<span class="size"><b>Pages</b>: 11 </span>
<span xmlns:gcse="uri:dummy-google-ns" class="size"><b>Size</b>: 660 Kb </span>
# CSS selector
soup3.select('div.paper')[2].select('span[class = "size"]')
# XPath: within the <div> whose class contains "paper", find the <span> tags with class="size"
tree3.xpath('//div[contains(@class, "paper")][3]/span[@class = "size"]/text()')
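As with the keywords, the "Pages:" and "Size:" labels come along with the text; a sketch of pulling out just the numbers in Python (variable names are illustrative):
# extract the bare numbers from "Pages: 11" and "Size: 660 Kb"
size_spans = soup3.select('div.paper')[2].select('span[class = "size"]')
pages = re.search(r'\d+', size_spans[0].text).group()    # '11'
size_kb = re.search(r'\d+', size_spans[1].text).group()  # '660'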
# page + size
ps <- conf.ch.2018 %>% html_nodes("div.paper span.size") %>% html_text()
page <- ps[seq(1,length(ps),2)]
size <- ps[seq(2,length(ps),2)]
(page.f <- page[3])
(size.f <- size[3])
[1] "Pages: 11 "
[1] "Size: 660 Kb "
Only grab the section name & title from the URL:
def title_grab(url):
    r_sugi = requests.get(url, proxies=proxyDict, verify=False)
    soup = BeautifulSoup(r_sugi.text, 'lxml')
    num_paper = len(soup.select('div.paper'))
    title = []
    for i in range(num_paper):
        div = soup.find_all("div", {"class": "paper"})[i]
        # print(i)
        # the title link follows the first <a> anchor inside each paper <div>
        t = div.find("a").find_next_sibling()
        if not t:
            print("No title")
        else:
            title.append(t.text)
    title_df = pd.DataFrame({'Title': title})
    return title_df
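A usage sketch, assuming the sugi_2018_url built earlier:
# one-column DataFrame of titles for the SAS Global Forum 2018 proceedings page
titles_2018 = title_grab(sugi_2018_url)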
def section_grab(url):
    s = requests.get(url, proxies=proxyDict, verify=False)
    soup = BeautifulSoup(s.text, 'lxml')
    num_stream = len(soup.select("div[class='streams'] span"))
    df_list = []
    for i in range(num_stream):
        # print(i)
        x = soup.find_all("span", {"class": "stream"})[i]
        sec_name = x.text
        sec_url = 'https://www.lexjansen.com' + x.find("a").get('href')[2:]
        df = title_grab(sec_url)
        df.insert(column="Section_name", value=sec_name, loc=0)
        df_list.append(df)
    return df_list
section = section_grab("https://www.lexjansen.com/sugi/")
Sections = pd.concat([pd.DataFrame(section[x]) for x in range(len(section))], axis=0, ignore_index=True)
Sections.to_csv('Sections.csv', index = False)
sct <- conf.ch %>% html_nodes("div.streams span.stream") %>% html_text
section_name <- sct[7]
#get the section links
uri3 <- conf.ch %>% html_nodes("div.streams span.stream a") %>% html_attr("href")
sublink3 <- paste0(website, substring(uri3,3))
sugi.section <- read_html(sublink3[7])
#get paper titles within the section
sct_papers <- sugi.section %>% html_nodes("div.paper a") %>% html_text
section.f <- ifelse(pa.name.f %in% sct_papers, section_name, "")
sugi = pd.read_excel('SUGI.xlsx')
section = pd.read_excel('Sections.xlsx')
pd.merge(sugi, section, on="Title", how='left')
Only grab the title from the URL, and set the column "Best_paper_fl" to "Y":
def best_paper_title(url):
    r_sugi = requests.get(url, proxies=proxyDict, verify=False)
    soup = BeautifulSoup(r_sugi.text, 'lxml')
    title = []
    best = soup.select("div.paperback > a")
    if best:
        for pp in best:
            title.append(pp.text)
    paper_df = pd.DataFrame({'Title': title})
    return paper_df
def best_paper_fl(url):
    r_sugi = requests.get(url, proxies=proxyDict, verify=False)
    soup = BeautifulSoup(r_sugi.text, 'lxml')
    paper = soup.select("li a")
    df_list = []
    for i in range(len(paper)):
        # print(i)
        conf_url = 'https://www.lexjansen.com' + paper[i].get('href')[2:]
        df = best_paper_title(conf_url)
        df.insert(column="Best_paper_fl", value="Y", loc=0)
        df_list.append(df)
    return df_list
best = best_paper_fl("https://www.lexjansen.com/sugi/")
best = pd.concat([pd.DataFrame(best[x]) for x in range(len(best))], axis=0, ignore_index=True)
best.to_csv('Best_paper.csv', index = False)
sugi = pd.read_excel('SUGI.xlsx')
section = pd.read_excel('Sections.xlsx')
best = pd.read_excel('Best_paper.xlsx')
final = sugi.merge(section, on='Title', how='left').merge(best, on='Title', how='left')
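# optional cleanup (an assumption, not in the original workflow): the left merges leave
# NaN where a title has no section or best-paper match; blank them out before exporting
final = final.fillna({'Section_name': '', 'Best_paper_fl': ''})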
final.to_csv('SUGI_paper.csv', index = False)