This data collection project consists of two mini-projects: one that uses an API and one that uses web scraping.
In the first mini-project, I use an API to collect job-posting data, gathered from job portals and stored in IBM Cloud, in order to identify trends in emerging programming languages and database skills.
In the second mini-project, I use web scraping to collect Amazon's 30 best-selling desktop computers.
library(httr)
## Warning: package 'httr' was built under R version 4.2.2
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 4.2.2
library(readr)
library(data.table)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6 v dplyr 1.0.10
## v tibble 3.1.8 v stringr 1.4.1
## v tidyr 1.2.1 v forcats 0.5.2
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::between() masks data.table::between()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x purrr::flatten() masks jsonlite::flatten()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
Sending a GET request to the API
# Request the jobs dataset, then fail fast on an unsuccessful HTTP status so
# that an error page is never mistaken for job data further down.
emerging_skills <- GET("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/module%201/Accessing%20Data%20Using%20APIs/jobs.json")
stop_for_status(emerging_skills)
Inspecting the structure of the GET response
# Print the nested structure of the response object (status code, headers,
# raw body, timings, request details) to see where the payload lives.
str(emerging_skills)
## List of 10
## $ url : chr "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/module%201/A"| __truncated__
## $ status_code: int 200
## $ headers :List of 13
## ..$ date : chr "Sat, 19 Nov 2022 13:53:13 GMT"
## ..$ x-clv-request-id : chr "98932ba4-3174-44d9-aab8-7c40b2a63380"
## ..$ server : chr "Cleversafe"
## ..$ x-clv-s3-version : chr "2.5"
## ..$ accept-ranges : chr "bytes"
## ..$ x-amz-request-id : chr "98932ba4-3174-44d9-aab8-7c40b2a63380"
## ..$ x-amz-mp-parts-count: chr "2"
## ..$ cache-control : chr "max-age=0,public"
## ..$ etag : chr "\"444164853f0b0dc867ee9cbec13ef254-2\""
## ..$ content-type : chr "application/json"
## ..$ last-modified : chr "Tue, 28 Jun 2022 07:55:52 GMT"
## ..$ x-amz-storage-class : chr "standard"
## ..$ content-length : chr "12878382"
## ..- attr(*, "class")= chr [1:2] "insensitive" "list"
## $ all_headers:List of 1
## ..$ :List of 3
## .. ..$ status : int 200
## .. ..$ version: chr "HTTP/1.1"
## .. ..$ headers:List of 13
## .. .. ..$ date : chr "Sat, 19 Nov 2022 13:53:13 GMT"
## .. .. ..$ x-clv-request-id : chr "98932ba4-3174-44d9-aab8-7c40b2a63380"
## .. .. ..$ server : chr "Cleversafe"
## .. .. ..$ x-clv-s3-version : chr "2.5"
## .. .. ..$ accept-ranges : chr "bytes"
## .. .. ..$ x-amz-request-id : chr "98932ba4-3174-44d9-aab8-7c40b2a63380"
## .. .. ..$ x-amz-mp-parts-count: chr "2"
## .. .. ..$ cache-control : chr "max-age=0,public"
## .. .. ..$ etag : chr "\"444164853f0b0dc867ee9cbec13ef254-2\""
## .. .. ..$ content-type : chr "application/json"
## .. .. ..$ last-modified : chr "Tue, 28 Jun 2022 07:55:52 GMT"
## .. .. ..$ x-amz-storage-class : chr "standard"
## .. .. ..$ content-length : chr "12878382"
## .. .. ..- attr(*, "class")= chr [1:2] "insensitive" "list"
## $ cookies :'data.frame': 0 obs. of 7 variables:
## ..$ domain : logi(0)
## ..$ flag : logi(0)
## ..$ path : logi(0)
## ..$ secure : logi(0)
## ..$ expiration: 'POSIXct' num(0)
## ..$ name : logi(0)
## ..$ value : logi(0)
## $ content : raw [1:12878382] 5b 0a 20 7b ...
## $ date : POSIXct[1:1], format: "2022-11-19 13:53:13"
## $ times : Named num [1:6] 0 0.228 0.465 1.736 2.439 ...
## ..- attr(*, "names")= chr [1:6] "redirect" "namelookup" "connect" "pretransfer" ...
## $ request :List of 7
## ..$ method : chr "GET"
## ..$ url : chr "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/module%201/A"| __truncated__
## ..$ headers : Named chr "application/json, text/xml, application/xml, */*"
## .. ..- attr(*, "names")= chr "Accept"
## ..$ fields : NULL
## ..$ options :List of 2
## .. ..$ useragent: chr "libcurl/7.64.1 r-curl/4.3.2 httr/1.4.4"
## .. ..$ httpget : logi TRUE
## ..$ auth_token: NULL
## ..$ output : list()
## .. ..- attr(*, "class")= chr [1:2] "write_memory" "write_function"
## ..- attr(*, "class")= chr "request"
## $ handle :Class 'curl_handle' <externalptr>
## - attr(*, "class")= chr "response"
Inspecting the response body, which holds the data we need as raw bytes
# Look at the body of the response, pulled out by its exact element name.
str(emerging_skills[["content"]])
## raw [1:12878382] 5b 0a 20 7b ...
Converting the raw data to text
# Decode the raw response body into a single string. The encoding is stated
# explicitly so httr does not have to fall back to a default and emit its
# "No encoding supplied" message.
emerging_skills_content <- content(
  emerging_skills,
  as = "text",
  encoding = "UTF-8"
)
str(emerging_skills_content)
## chr "[\n {\n \"Id\": 0,\n \"Job Title\": \"Digital Media Planner\",\n \"Job Experience Required\": \"5 - 10 yr"| __truncated__
# Parse the JSON text into an R object for analysis.
emerging_skills_JSON <- fromJSON(emerging_skills_content)
library(robotstxt)
## Warning: package 'robotstxt' was built under R version 4.2.2
library(rvest)
## Warning: package 'rvest' was built under R version 4.2.2
##
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
##
## guess_encoding
library(selectr)
library(xml2)
library(dplyr)
library(stringr)
library(forcats)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(tidyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(tibble)
library(purrr)
Checking the site's robots.txt to see whether we are allowed to scrape the page
# Ask robotstxt whether the best-sellers page may be fetched by a crawler.
paths_allowed(
  paths = "https://www.amazon.com/Best-Sellers-Desktop-Computers/zgbs/electronics/565098"
)
## Warning in get(name, envir = env, inherits = FALSE): strings not representable
## in native encoding will be translated to UTF-8
##
www.amazon.com
## [1] TRUE
Reading the HTML of the best-sellers page
# Download and parse the best-sellers page once; the name, price and rating
# extractions all query this parsed document.
top_computers <- read_html(
  "https://www.amazon.com/Best-Sellers-Desktop-Computers/zgbs/electronics/565098"
)
top_computers
## {html_document}
## <html lang="en-us" class="a-no-js" data-19ax5a9jf="dingo">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="a-m-us a-aui_72554-c a-aui_accordion_a11y_role_354025-c a-au ...
Extracting computer names
# Take the text of every node matching the product-title selector and keep
# only the part before the first comma, which cuts off the trailing spec list.
# NOTE(review): the selector looks like a generated class name and will
# probably need updating when the page markup changes — confirm before re-use.
Brand_name <- top_computers %>%
  html_nodes("._cDEzb_p13n-sc-css-line-clamp-3_g3dy1") %>%
  html_text() %>%
  str_split(",") %>%
  map_chr(1)
Brand_name
## [1] "Dell OptiPlex Computer Desktop PC"
## [2] "CYBERPOWERPC Gamer Master Gaming PC"
## [3] "Dell Optiplex 9020 Small Form Factor Desktop with Intel Core i7-4770 Upto 3.9GHz"
## [4] "HP Elite Desktop PC Computer Intel Core i5 3.1-GHz"
## [5] "CYBERPOWERPC Gamer Xtreme VR Gaming PC"
## [6] "SkyTech Chronos Mini Gaming Computer PC Desktop - Intel Core-i3 10100F 3.6GHz"
## [7] "Dell Inspiron 3910 Desktop Computer Tower - 12th Gen Intel Core i5-12400"
## [8] "HP EliteDesk 800 G1 SFF High Performance Business Desktop Computer"
## [9] "BOMIX 2023 Newest H8 HTV8"
## [10] "ROG Strix G10 Gaming Desktop PC"
## [11] "HP Pavilion Gaming Desktop"
## [12] "CYBERPOWERPC Gamer Supreme Liquid Cool Gaming PC"
## [13] "HP Envy Desktop Bundle PC"
## [14] "Dell OptiPlex 9020-SFF"
## [15] "Beelink Mini S Intel 11th Gen N5095 Processor(4C/4T"
## [16] "Dell Desktop Computer Package Compatible with Dell Optiplex 7010 Intel Quad Core i5 3.2GHz"
## [17] "KAMRUI AK1 Pro Mini PC"
## [18] "iBUYPOWER Pro Gaming PC Computer Desktop Slate5MR 253i (Intel i3 10105F 3.7 GHz"
## [19] "Dell Optiplex 7050 SFF Desktop PC Intel i7-7700 4-Cores 3.60GHz 32GB DDR4 1TB SSD WiFi BT HDMI Duel Monitor Support Windows 10 Pro Excellent Condition(Renewed)"
## [20] "HP Elite 800G1 Desktop Computer Package - Intel Quad Core i5 3.3GHz"
## [21] "Acer Aspire TC-1760-UA92 Desktop | 12th Gen Intel Core i5-12400 6-Core Processor | 12GB 3200MHz DDR4 | 512GB NVMe M.2 SSD | 8X DVD | Intel Wireless Wi-Fi 6 AX201 | Bluetooth 5.2 | Windows 11 Home"
## [22] "Dell Optiplex 9010 SFF Desktop Computer - Intel i7-3770 Upto 3.9GHz"
## [23] "Dell OptiPlex 7050 Micro Computer"
## [24] "Microsoft Authorized Refurbished- HP Elite Desktop PC Computer Intel Core i5 3.1-GHz"
## [25] "Skytech Blaze II Gaming PC Desktop – AMD Ryzen 5 5600G 3.9 GHz"
## [26] "Dell Optiplex 7010 Business Desktop Computer (Intel Quad Core i5-3470 3.2GHz"
## [27] "Apple iMac 21.5in 2.7GHz Core i5 (ME086LL/A) All In One Desktop"
## [28] "HP ProDesk Desktop RGB Lights Computer Intel Core i5 4570 3.2 GHz 8GB RAM 256GB SSD Win 10 Pro WiFi"
## [29] "HP ProDesk 600 G1 SFF Slim Business Desktop Computer"
## [30] "2020 Apple iMac with Retina 5K Display (27-inch"
Extracting computer prices
# Collect the text of every span inside a link, keep only strings that start
# with "$", and drop the NA entries left behind by non-price spans.
# NOTE(review): unique() presumably collapses prices that are matched more
# than once, but it would also merge two products that share a price and
# leave Price shorter than the other columns — verify against the page.
Price <- top_computers %>%
  html_nodes(".a-link-normal span") %>%
  html_text() %>%
  str_extract(pattern = "^\\$.*") %>%
  na.omit() %>%
  unique()
Price
## [1] "$257.00" "$849.99" "$266.11" "$139.00" "$769.99" "$599.99"
## [7] "$548.00" "$134.50" "$205.00" "$1,449.99" "$529.00" "$1,269.99"
## [13] "$1,599.99" "$138.00" "$159.00" "$132.99" "$199.94" "$1,099.99"
## [19] "$337.99" "$250.00" "$565.61" "$226.69" "$137.99" "$130.99"
## [25] "$899.99" "$135.00" "$279.99" "$134.99" "$89.99" "$1,195.55"
Extracting computer ratings
# Take the alt text of each rating icon, keep what precedes the first "out",
# and trim the surrounding whitespace. The result stays a character vector.
Rating <- top_computers %>%
  html_nodes(".a-icon-alt") %>%
  html_text() %>%
  str_split("out") %>%
  map_chr(1) %>%
  str_trim()
Rating
## [1] "4.1" "4.6" "4.3" "4.0" "4.6" "4.4" "4.3" "4.3" "4.4" "3.7" "4.3" "4.2"
## [13] "4.7" "4.4" "4.8" "4.2" "4.4" "4.4" "4.1" "4.2" "4.5" "3.7" "4.3" "4.0"
## [25] "4.3" "4.2" "4.3" "4.0" "4.3" "4.7"
Combining the three vectors into one tibble named Amazon_Computer_Best_Sellers
# Bind the scraped name, price and rating vectors column-wise into one tibble,
# one row per product.
Amazon_Computer_Best_Sellers <- tibble(
  Brand_name = Brand_name,
  Price = Price,
  Rating = Rating
)
Amazon_Computer_Best_Sellers
## # A tibble: 30 x 3
## Brand_name Price Rating
## <chr> <chr> <chr>
## 1 Dell OptiPlex Computer Desktop PC $257~ 4.1
## 2 CYBERPOWERPC Gamer Master Gaming PC $849~ 4.6
## 3 Dell Optiplex 9020 Small Form Factor Desktop with Intel Core i7~ $266~ 4.3
## 4 HP Elite Desktop PC Computer Intel Core i5 3.1-GHz $139~ 4.0
## 5 CYBERPOWERPC Gamer Xtreme VR Gaming PC $769~ 4.6
## 6 SkyTech Chronos Mini Gaming Computer PC Desktop - Intel Core-i3~ $599~ 4.4
## 7 Dell Inspiron 3910 Desktop Computer Tower - 12th Gen Intel Core~ $548~ 4.3
## 8 HP EliteDesk 800 G1 SFF High Performance Business Desktop Compu~ $134~ 4.3
## 9 BOMIX 2023 Newest H8 HTV8 $205~ 4.4
## 10 ROG Strix G10 Gaming Desktop PC $1,4~ 3.7
## # ... with 20 more rows