Skip to content

Commit 85f47c8

Browse files
authored
WebScraper_WebCrawler.py
This script scrapes and crawls real-estate listing web pages.
1 parent 959218f commit 85f47c8

File tree

1 file changed

+75
-0
lines changed

1 file changed

+75
-0
lines changed

Diff for: web_crawler.py

+75
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
"""Scrape paginated real-estate listings from pythonhow.com into a CSV.

Fetches the first results page to learn the page count, crawls every
paginated results page, extracts per-property fields (address, locality,
price, beds, area, baths, lot size) from each ``div.propertyRow``, and
writes the collected rows to ``martyOutput.csv``.
"""

import requests
from bs4 import BeautifulSoup
import pandas

# Landing page of the listing; paginated pages append "t=0&s=<offset>.html".
LANDING_URL = "http://www.pythonhow.com/real-estate/rock-springs-wy/LCWYROCKSPRINGS/"
BASE_URL = LANDING_URL + "t=0&s="


def _bold_text(item, css_class):
    """Return the text of the <b> inside the first span of *css_class*, or None.

    Replaces the original bare ``except:`` blocks: only the genuinely
    expected "element missing" case yields None.
    """
    # NOTE: attrs must be a dict; the original passed a set literal
    # ({"class","infoBed"}) which is not valid BeautifulSoup attrs.
    span = item.find("span", {"class": css_class})
    if span is None:
        return None
    bold = span.find("b")
    return bold.text if bold is not None else None


def _parse_property(item):
    """Extract one listing's fields from its ``div.propertyRow`` element."""
    record = {}

    # The address span appears twice per row: street address, then locality.
    address_spans = item.find_all("span", {"class": "propAddressCollapse"})
    record["Address"] = address_spans[0].text if address_spans else None
    record["Locality"] = address_spans[1].text if len(address_spans) > 1 else None

    price = item.find("h4", {"class": "propPrice"})
    record["Price"] = (
        price.text.replace("\n", "").replace(" ", "") if price is not None else None
    )

    record["Beds"] = _bold_text(item, "infoBed")
    record["Area"] = _bold_text(item, "infoSqFt")
    record["Full Baths"] = _bold_text(item, "infoValueFullBath")
    record["Half Baths"] = _bold_text(item, "infoValueHalfBath")

    # "Lot Size" lives in a featureGroup/featureName span pair inside a
    # columnGroup div; only rows that have it get the key.
    for column_group in item.find_all("div", {"class": "columnGroup"}):
        groups = column_group.find_all("span", {"class": "featureGroup"})
        names = column_group.find_all("span", {"class": "featureName"})
        for feature_group, feature_name in zip(groups, names):
            if "Lot Size" in feature_group.text:
                record["Lot Size"] = feature_name.text

    return record


def main():
    """Crawl every results page and write the scraped rows to martyOutput.csv."""
    first_page = BeautifulSoup(requests.get(LANDING_URL).content, "html.parser")

    # The last pagination link's text is the total number of result pages.
    page_count = int(first_page.find_all("a", {"class": "Page"})[-1].text)

    rows = []
    # Results are paginated in steps of 10 (s=0, s=10, ...).
    for offset in range(0, page_count * 10, 10):
        url = BASE_URL + str(offset) + ".html"
        print(url)  # fixed: the original print omitted the "." before "html"
        soup = BeautifulSoup(requests.get(url).content, "html.parser")
        for item in soup.find_all("div", {"class": "propertyRow"}):
            rows.append(_parse_property(item))

    pandas.DataFrame(rows).to_csv("martyOutput.csv")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)