1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
## Paste the URL of your search result
url <- "http://www.finn.no/finn/realestate/homes/result?keyword=&PRICE_FROM=&PRICE_TO=5000000&ESTATE_SIZE%2FLIVING_AREA_FROM=80&ESTATE_SIZE%2FLIVING_AREA_TO=&areaId=20045&areaId=20046&NO_OF_BEDROOMS=3&PLOT%2FAREARANGE_FROM=&PLOT%2FAREARANGE_TO=&rows=50&sort=1"
## The paging code further down rewrites the "page=<n>" query parameter,
## so make sure the URL carries one (a fresh search URL does not).
if (!grepl("page=[[:digit:]]+", url)) {
  url <- paste(url, "page=1", sep = "&")
}
## Load libraries needed
library(RCurl)
library(googleVis)
library(RgoogleMaps)
## Extract the balanced tag.1 ... tag.2 fragments (by default <div> ... </div>)
## that enclose each occurrence of a marker pattern.
##
## For every match of `ptn`, the nearest tag event before the match must be an
## opening tag (the marker is expected to sit inside that tag, e.g. in its
## class attribute). From there opening tags count +1 and closing tags -1
## until the depth returns to zero; the text from the opening tag through the
## end of that closing tag is the fragment.
##
## Arguments:
##   xml   - character; only the first element is searched.
##   tag.1 - regular expression matching the start of an opening tag.
##   tag.2 - regular expression matching a closing tag.
##   ptn   - regular expression marking the fragments of interest.
##
## Value: a character vector with one fragment per match of `ptn`, in document
##   order. A match that is not inside an opening tag, or whose opening tag is
##   never closed, yields NA. When `xml` is empty/NA or `ptn` does not occur,
##   character(0) is returned.
xml.tag <- function(xml, tag.1 = "<div", tag.2 = "</div>",
                    ptn = "mod mtn mhn mbs") {
  if (length(xml) == 0L || is.na(xml[[1]])) {
    return(character(0))
  }
  xml <- xml[[1]]

  # gregexpr() reports "no match" as a single -1.
  hits <- as.integer(gregexpr(ptn, xml)[[1]])
  if (hits[1] == -1L) {
    return(character(0))
  }

  # One row per tag event: position, matched length, and depth change.
  open.m <- gregexpr(tag.1, xml)[[1]]
  close.m <- gregexpr(tag.2, xml)[[1]]
  tags <- data.frame(
    id = c(as.integer(open.m), as.integer(close.m)),
    len = c(attr(open.m, "match.length"), attr(close.m, "match.length")),
    v = rep(c(1L, -1L), times = c(length(open.m), length(close.m)))
  )
  tags <- tags[tags$id > 0L, ]
  tags <- tags[order(tags$id), ]

  vapply(hits, function(hit) {
    # Row of the last tag event that starts before the marker.
    first <- sum(tags$id < hit)
    if (first == 0L || tags$v[first] != 1L) {
      return(NA_character_)
    }
    # Running nesting depth, starting at 1 for the enclosing opening tag.
    depth <- cumsum(tags$v[first:nrow(tags)])
    closed <- which(depth == 0L)
    if (length(closed) == 0L) {
      return(NA_character_)
    }
    last <- first + closed[1] - 1L
    # "- 1L": the fragment ends on the final character of the closing tag.
    substr(xml, tags$id[first], tags$id[last] + tags$len[last] - 1L)
  }, character(1))
}
## Download each ad;
## Scrape every result page of the search in `url`, then every ad it lists,
## into the data frame `Res` (columns Size, Price, Addr, Img, Title, Link,
## Year; all character). Relies on `url`, `xml.tag()` and RCurl's `getURL()`
## being set up earlier in the script. Finally keeps only ads whose
## construction year is between 2000 and 2010.

## First element of `x`, or NA when `x` is empty.
first.or.na <- function(x) {
  if (length(x) > 0L) x[[1]] else NA_character_
}

## Text content of an HTML fragment: all tags removed, outer spaces trimmed.
strip.tags <- function(fragment) {
  fragment <- first.or.na(fragment)
  if (is.na(fragment)) {
    return(NA_character_)
  }
  pieces <- regmatches(fragment, gregexpr("<.*?>", fragment),
                       invert = TRUE)[[1]]
  gsub("^ +| +$", "", paste(pieces, collapse = ""))
}

## First capture group of `pattern` in `text`, or NA when there is no match.
capture.first <- function(pattern, text) {
  text <- first.or.na(text)
  if (is.na(text)) {
    return(NA_character_)
  }
  hit <- regmatches(text, regexec(pattern, text))[[1]]
  if (length(hit) >= 2L) hit[2] else NA_character_
}

## Download `address`; NA (with a warning) instead of aborting the whole run.
fetch.or.na <- function(address) {
  if (is.na(address)) {
    return(NA_character_)
  }
  tryCatch(
    getURL(url = address),
    error = function(e) {
      warning("Could not download ", address, ": ", conditionMessage(e),
              call. = FALSE)
      NA_character_
    }
  )
}

## Must agree with the "rows=50" parameter of the search URL.
rows.per.page <- 50

xml <- getURL(url)
n <- as.numeric(capture.first("resultlist-counter\">([0-9]+)<", xml))
if (is.na(n)) {
  stop("Could not find the result counter on the search page.", call. = FALSE)
}

# One data frame per result page; bound together after the loop.
pages <- vector("list", ceiling(n / rows.per.page))
for (pg in seq_along(pages)) {
  print(pg)
  url.pg <- gsub("page=[[:digit:]]+", paste("page", pg, sep = "="), url)
  ads <- xml.tag(xml = getURL(url.pg))
  ads <- ads[!is.na(ads)]
  if (length(ads) == 0L) {
    next
  }

  # Transform html entity characters to displaying characters.
  # NOTE(review): the entity spellings in these patterns were already decoded
  # to their literal characters in the copy of the script this was restored
  # from; the named/numeric forms below are a reconstruction - confirm.
  ads <- gsub("\n|\t|\v", "", ads)
  ads <- gsub("&nbsp;|&#160;|\u00a0", " ", ads)
  ads <- gsub("&ldquo;|&#8220;|\u201c", "'", ads)
  ads <- gsub("&rdquo;|&#8221;|\u201d", "'", ads)
  ads <- gsub("&sup2;|&#178;|\u00b2", "2", ads)
  ads <- gsub("&apos;|&#0?39;", "'", ads)
  # "&amp;" goes last so that e.g. "&amp;nbsp;" is not decoded twice.
  ads <- gsub("&amp;", "&", ads)

  # Create a data frame for holding the information for one web page.
  res <- data.frame(
    Size = rep(NA_character_, length(ads)),
    Price = NA_character_,
    Addr = NA_character_,
    Img = NA_character_,
    Title = NA_character_,
    Link = NA_character_,
    Year = NA_character_,
    stringsAsFactors = FALSE
  )

  for (i in seq_along(ads)) {
    # xml fragments holding size/price, image/link/title, and address.
    mbm <- first.or.na(xml.tag(xml = ads[i], ptn = "line mbl"))
    mbm.Img <- first.or.na(xml.tag(xml = ads[i], ptn = "img"))
    mbm.Add <- first.or.na(xml.tag(xml = ads[i],
                                   ptn = "unit size1of2 neutral"))
    mbm.Size <- xml.tag(xml = mbm, ptn = "unit size1of3 keyinfo")
    mbm.Price <- xml.tag(xml = mbm, ptn = "unit size1of3 lastUnit keyinfo")

    Link <- capture.first("(http.*?)\"", mbm.Img)

    # The address is the third non-empty text piece, else the second.
    addr.parts <- character(0)
    if (!is.na(mbm.Add)) {
      addr.parts <- grep(
        "[[:alnum:]]",
        regmatches(mbm.Add, gregexpr("<.*?>", mbm.Add), invert = TRUE)[[1]],
        value = TRUE
      )
    }
    Addr <- addr.parts[3]
    if (is.na(Addr)) {
      Addr <- addr.parts[2]
    }

    # The construction year is only shown on the ad's own page.
    xml.ad <- fetch.or.na(Link)
    Year <- capture.first(
      "<dt>Bygge.r</dt>.*?<dd>([[:digit:]]{4})</dd>", xml.ad
    )

    # Extract useful information.
    res$Size[i] <- strip.tags(mbm.Size)
    # NOTE(review): the first alternative is a bare "?" in the script as
    # received, which is not a valid regex on its own; it is matched as a
    # literal question mark here - confirm the intended character.
    res$Price[i] <- gsub("[?]|fra|til", "", strip.tags(mbm.Price))
    res$Title[i] <- strip.tags(mbm.Img)
    res$Img[i] <- capture.first("<img src=\"(.*?)\"", mbm.Img)
    res$Addr[i] <- Addr
    res$Link[i] <- Link
    res$Year[i] <- Year
  }
  pages[[pg]] <- res
}

Res <- do.call(rbind, pages)
if (is.null(Res)) {
  # No ads at all: keep `Res` a data frame so later nrow(Res) checks work.
  Res <- data.frame(
    Size = character(0), Price = character(0), Addr = character(0),
    Img = character(0), Title = character(0), Link = character(0),
    Year = character(0), stringsAsFactors = FALSE
  )
}

# Compare years as numbers, not strings, and drop ads with unknown year.
year.num <- suppressWarnings(as.numeric(Res$Year))
Res <- Res[!is.na(year.num) & year.num >= 2000 & year.num <= 2010, ]
## Geocode each ad's address with the Google Geocoding API, then show the ads
## on a googleVis map. Expects `Res` with columns Addr, Link, Img, Title,
## Size, Price and Year; adds Lon, Lat, LatLong and Tip, and drops rows that
## could not be geocoded. The map is written to "c:/gmap.html" and opened in
## the browser.
## NOTE(review): this endpoint (plain http, "sensor" parameter, no API key) is
## kept exactly as in the original script; Google's current Geocoding API may
## reject such requests - confirm against the API documentation.
if (nrow(Res) > 0) {
  gapi <- "http://maps.googleapis.com/maps/api/geocode/xml?sensor=false&address="
  Res$Lon <- NA_real_
  Res$Lat <- NA_real_
  for (i in seq_len(nrow(Res))) {
    print(i)
    if (is.na(Res$Addr[i])) {
      # Nothing to look up; the row is dropped below because Lat stays NA.
      next
    }
    url <- gsub(" ", "%20", paste0(gapi, paste(Res$Addr[i], "Norway", sep = " ")))
    # Replace Norwegian letters with ASCII approximations for the query.
    url <- gsub("Å|å", "a", url)
    url <- gsub("Ø|ø", "o", url)
    url <- gsub("Æ|æ", "ae", url)
    xml <- getURL(url)
    # Pause between requests to stay gentle on the geocoding service.
    Sys.sleep(.5)
    Res$Lon[i] <- as.numeric(regmatches(xml, regexec("<lng>(.+?)</lng>", xml))[[1]][2])
    Res$Lat[i] <- as.numeric(regmatches(xml, regexec("<lat>(.+?)</lat>", xml))[[1]][2])
  }
  Res <- Res[!is.na(Res$Lat), ]

  if (nrow(Res) > 0) {
    # gvisMap expects the location as "lat:long".
    Res$LatLong <- paste(Res$Lat, Res$Lon, sep = ":")
    # Tooltip: clickable thumbnail followed by the ad's key facts.
    # sep = "\"" wraps the link and image URLs in double quotes.
    Res$Tip <- paste("<a href=", Res$Link, "><img src=", Res$Img, " /></a>", sep = "\"")
    Res$Tip <- paste(Res$Tip, Res$Title, Res$Size, Res$Price, Res$Year, sep = "<br />")
    M <- gvisMap(Res, "LatLong", "Tip",
                 options = list(showTip = TRUE, enableScrollWheel = TRUE,
                                mapType = "hybrid", useMapTypeControl = TRUE,
                                width = 800, height = 400))

    cat(M$html$chart, file = "c:/gmap.html")
    browseURL("c:/gmap.html")
  } else {
    message("None of the addresses could be geocoded; no map written.")
  }
}