Executive Summary

This data collection project consists of two mini-projects: one using an API and one using web scraping.

In the first mini-project, I use an API to collect job-posting data (gathered from job-posting portals and stored in IBM Cloud) in order to identify trends in emerging programming-language and database skills.

In the second mini-project, I use web scraping to collect Amazon's 30 best-selling desktop computers.

Data Collection from IBM using API

library(httr)
## Warning: package 'httr' was built under R version 4.2.2
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 4.2.2
library(readr)
library(data.table)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6      v dplyr   1.0.10
## v tibble  3.1.8      v stringr 1.4.1 
## v tidyr   1.2.1      v forcats 0.5.2 
## v purrr   0.3.4      
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::between()   masks data.table::between()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks data.table::first()
## x purrr::flatten()   masks jsonlite::flatten()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks data.table::last()
## x purrr::transpose() masks data.table::transpose()

Sending a GET request to call the API

# Download the job-postings JSON file. GET() returns a response object even
# when the server answers with an HTTP error status, so stop here on a 4xx/5xx
# response instead of handing an error body to the parsing steps below.
emerging_skills <- GET("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/module%201/Accessing%20Data%20Using%20APIs/jobs.json")
stop_for_status(emerging_skills)

Viewing API GET response result

# Inspect the structure of the response object returned by GET().
str(emerging_skills)
## List of 10
##  $ url        : chr "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/module%201/A"| __truncated__
##  $ status_code: int 200
##  $ headers    :List of 13
##   ..$ date                : chr "Sat, 19 Nov 2022 13:53:13 GMT"
##   ..$ x-clv-request-id    : chr "98932ba4-3174-44d9-aab8-7c40b2a63380"
##   ..$ server              : chr "Cleversafe"
##   ..$ x-clv-s3-version    : chr "2.5"
##   ..$ accept-ranges       : chr "bytes"
##   ..$ x-amz-request-id    : chr "98932ba4-3174-44d9-aab8-7c40b2a63380"
##   ..$ x-amz-mp-parts-count: chr "2"
##   ..$ cache-control       : chr "max-age=0,public"
##   ..$ etag                : chr "\"444164853f0b0dc867ee9cbec13ef254-2\""
##   ..$ content-type        : chr "application/json"
##   ..$ last-modified       : chr "Tue, 28 Jun 2022 07:55:52 GMT"
##   ..$ x-amz-storage-class : chr "standard"
##   ..$ content-length      : chr "12878382"
##   ..- attr(*, "class")= chr [1:2] "insensitive" "list"
##  $ all_headers:List of 1
##   ..$ :List of 3
##   .. ..$ status : int 200
##   .. ..$ version: chr "HTTP/1.1"
##   .. ..$ headers:List of 13
##   .. .. ..$ date                : chr "Sat, 19 Nov 2022 13:53:13 GMT"
##   .. .. ..$ x-clv-request-id    : chr "98932ba4-3174-44d9-aab8-7c40b2a63380"
##   .. .. ..$ server              : chr "Cleversafe"
##   .. .. ..$ x-clv-s3-version    : chr "2.5"
##   .. .. ..$ accept-ranges       : chr "bytes"
##   .. .. ..$ x-amz-request-id    : chr "98932ba4-3174-44d9-aab8-7c40b2a63380"
##   .. .. ..$ x-amz-mp-parts-count: chr "2"
##   .. .. ..$ cache-control       : chr "max-age=0,public"
##   .. .. ..$ etag                : chr "\"444164853f0b0dc867ee9cbec13ef254-2\""
##   .. .. ..$ content-type        : chr "application/json"
##   .. .. ..$ last-modified       : chr "Tue, 28 Jun 2022 07:55:52 GMT"
##   .. .. ..$ x-amz-storage-class : chr "standard"
##   .. .. ..$ content-length      : chr "12878382"
##   .. .. ..- attr(*, "class")= chr [1:2] "insensitive" "list"
##  $ cookies    :'data.frame': 0 obs. of  7 variables:
##   ..$ domain    : logi(0) 
##   ..$ flag      : logi(0) 
##   ..$ path      : logi(0) 
##   ..$ secure    : logi(0) 
##   ..$ expiration: 'POSIXct' num(0) 
##   ..$ name      : logi(0) 
##   ..$ value     : logi(0) 
##  $ content    : raw [1:12878382] 5b 0a 20 7b ...
##  $ date       : POSIXct[1:1], format: "2022-11-19 13:53:13"
##  $ times      : Named num [1:6] 0 0.228 0.465 1.736 2.439 ...
##   ..- attr(*, "names")= chr [1:6] "redirect" "namelookup" "connect" "pretransfer" ...
##  $ request    :List of 7
##   ..$ method    : chr "GET"
##   ..$ url       : chr "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/module%201/A"| __truncated__
##   ..$ headers   : Named chr "application/json, text/xml, application/xml, */*"
##   .. ..- attr(*, "names")= chr "Accept"
##   ..$ fields    : NULL
##   ..$ options   :List of 2
##   .. ..$ useragent: chr "libcurl/7.64.1 r-curl/4.3.2 httr/1.4.4"
##   .. ..$ httpget  : logi TRUE
##   ..$ auth_token: NULL
##   ..$ output    : list()
##   .. ..- attr(*, "class")= chr [1:2] "write_memory" "write_function"
##   ..- attr(*, "class")= chr "request"
##  $ handle     :Class 'curl_handle' <externalptr> 
##  - attr(*, "class")= chr "response"

Viewing the raw response body, which holds the data returned by the API

# The content element holds the response body as a raw (byte) vector.
str(emerging_skills$content)
##  raw [1:12878382] 5b 0a 20 7b ...

Converting the raw data to text

# Decode the raw response body into a single character string. The encoding is
# passed explicitly so httr does not have to guess it (the response's
# content-type header carries no charset).
emerging_skills_content <- content(
  emerging_skills,
  as = "text",
  encoding = "UTF-8"
)
str(emerging_skills_content)
##  chr "[\n {\n   \"Id\": 0,\n   \"Job Title\": \"Digital Media Planner\",\n   \"Job Experience Required\": \"5 - 10 yr"| __truncated__

# Parse the JSON text into an R object with jsonlite.
emerging_skills_JSON <- jsonlite::fromJSON(txt = emerging_skills_content)

Data Collection from Amazon using Web Scraping

library(robotstxt)
## Warning: package 'robotstxt' was built under R version 4.2.2
library(rvest)
## Warning: package 'rvest' was built under R version 4.2.2
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
library(selectr)

library(xml2)

library(dplyr)

library(stringr)

library(forcats)

library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
library(tidyr)

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tibble)

library(purrr)

Checking whether we are allowed to access the website

# Ask robotstxt whether the site's robots.txt permits fetching this page.
paths_allowed(
  paths = "https://www.amazon.com/Best-Sellers-Desktop-Computers/zgbs/electronics/565098"
)
## Warning in get(name, envir = env, inherits = FALSE): strings not representable
## in native encoding will be translated to UTF-8
## 
 www.amazon.com
## [1] TRUE

Reading html

# Download and parse the best-sellers page, then print the parsed document.
top_computers <- xml2::read_html(
  x = "https://www.amazon.com/Best-Sellers-Desktop-Computers/zgbs/electronics/565098"
)
top_computers
## {html_document}
## <html lang="en-us" class="a-no-js" data-19ax5a9jf="dingo">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body class="a-m-us a-aui_72554-c a-aui_accordion_a11y_role_354025-c a-au ...

Extracting computer names

# Product titles, cut at the first comma so that only the leading part of each
# listing title is kept.
Brand_name <- top_computers %>%
  html_nodes("._cDEzb_p13n-sc-css-line-clamp-3_g3dy1") %>%
  html_text() %>%
  str_split(",") %>%
  map_chr(1)
Brand_name
##  [1] "Dell OptiPlex Computer Desktop PC"                                                                                                                                                                  
##  [2] "CYBERPOWERPC Gamer Master Gaming PC"                                                                                                                                                                
##  [3] "Dell Optiplex 9020 Small Form Factor Desktop with Intel Core i7-4770 Upto 3.9GHz"                                                                                                                   
##  [4] "HP Elite Desktop PC Computer Intel Core i5 3.1-GHz"                                                                                                                                                 
##  [5] "CYBERPOWERPC Gamer Xtreme VR Gaming PC"                                                                                                                                                             
##  [6] "SkyTech Chronos Mini Gaming Computer PC Desktop - Intel Core-i3 10100F 3.6GHz"                                                                                                                      
##  [7] "Dell Inspiron 3910 Desktop Computer Tower - 12th Gen Intel Core i5-12400"                                                                                                                           
##  [8] "HP EliteDesk 800 G1 SFF High Performance Business Desktop Computer"                                                                                                                                 
##  [9] "BOMIX 2023 Newest H8 HTV8"                                                                                                                                                                          
## [10] "ROG Strix G10 Gaming Desktop PC"                                                                                                                                                                    
## [11] "HP Pavilion Gaming Desktop"                                                                                                                                                                         
## [12] "CYBERPOWERPC Gamer Supreme Liquid Cool Gaming PC"                                                                                                                                                   
## [13] "HP Envy Desktop Bundle PC"                                                                                                                                                                          
## [14] "Dell OptiPlex 9020-SFF"                                                                                                                                                                             
## [15] "Beelink Mini S Intel 11th Gen N5095 Processor(4C/4T"                                                                                                                                                
## [16] "Dell Desktop Computer Package Compatible with Dell Optiplex 7010 Intel Quad Core i5 3.2GHz"                                                                                                         
## [17] "KAMRUI AK1 Pro Mini PC"                                                                                                                                                                             
## [18] "iBUYPOWER Pro Gaming PC Computer Desktop Slate5MR 253i (Intel i3 10105F 3.7 GHz"                                                                                                                    
## [19] "Dell Optiplex 7050 SFF Desktop PC Intel i7-7700 4-Cores 3.60GHz 32GB DDR4 1TB SSD WiFi BT HDMI Duel Monitor Support Windows 10 Pro Excellent Condition(Renewed)"                                    
## [20] "HP Elite 800G1 Desktop Computer Package - Intel Quad Core i5 3.3GHz"                                                                                                                                
## [21] "Acer Aspire TC-1760-UA92 Desktop | 12th Gen Intel Core i5-12400 6-Core Processor | 12GB 3200MHz DDR4 | 512GB NVMe M.2 SSD | 8X DVD | Intel Wireless Wi-Fi 6 AX201 | Bluetooth 5.2 | Windows 11 Home"
## [22] "Dell Optiplex 9010 SFF Desktop Computer - Intel i7-3770 Upto 3.9GHz"                                                                                                                                
## [23] "Dell OptiPlex 7050 Micro Computer"                                                                                                                                                                  
## [24] "Microsoft Authorized Refurbished- HP Elite Desktop PC Computer Intel Core i5 3.1-GHz"                                                                                                               
## [25] "Skytech Blaze II Gaming PC Desktop – AMD Ryzen 5 5600G 3.9 GHz"                                                                                                                                     
## [26] "Dell Optiplex 7010 Business Desktop Computer (Intel Quad Core i5-3470 3.2GHz"                                                                                                                       
## [27] "Apple iMac 21.5in 2.7GHz Core i5 (ME086LL/A) All In One Desktop"                                                                                                                                    
## [28] "HP ProDesk Desktop RGB Lights Computer Intel Core i5 4570 3.2 GHz 8GB RAM 256GB SSD Win 10 Pro WiFi"                                                                                                
## [29] "HP ProDesk 600 G1 SFF Slim Business Desktop Computer"                                                                                                                                               
## [30] "2020 Apple iMac with Retina 5K Display (27-inch"

Extracting computer prices

# Price strings: take the text of every span inside a link and keep only the
# entries that start with "$", dropping non-matches and repeated values.
# NOTE(review): unique() also merges equal prices that belong to different
# products, and a listing without a price contributes nothing, so Price can end
# up shorter than Brand_name -- confirm the lengths match before combining.
Price <- top_computers %>%
  html_nodes(".a-link-normal span") %>%
  html_text() %>%
  str_extract(pattern = "^\\$.*") %>%
  na.omit() %>%
  unique()
Price
##  [1] "$257.00"   "$849.99"   "$266.11"   "$139.00"   "$769.99"   "$599.99"  
##  [7] "$548.00"   "$134.50"   "$205.00"   "$1,449.99" "$529.00"   "$1,269.99"
## [13] "$1,599.99" "$138.00"   "$159.00"   "$132.99"   "$199.94"   "$1,099.99"
## [19] "$337.99"   "$250.00"   "$565.61"   "$226.69"   "$137.99"   "$130.99"  
## [25] "$899.99"   "$135.00"   "$279.99"   "$134.99"   "$89.99"    "$1,195.55"

Extracting computer ratings

# Star ratings: split each icon's alt text at "out" and keep the trimmed
# leading part.
Rating <- top_computers %>%
  html_nodes(".a-icon-alt") %>%
  html_text() %>%
  str_split("out") %>%
  map_chr(1) %>%
  str_trim()
Rating
##  [1] "4.1" "4.6" "4.3" "4.0" "4.6" "4.4" "4.3" "4.3" "4.4" "3.7" "4.3" "4.2"
## [13] "4.7" "4.4" "4.8" "4.2" "4.4" "4.4" "4.1" "4.2" "4.5" "3.7" "4.3" "4.0"
## [25] "4.3" "4.2" "4.3" "4.0" "4.3" "4.7"

Combining the three vectors into one data frame (tibble) named Amazon_Computer_Best_Sellers

# Combine the three character vectors column-wise into one tibble; row i holds
# the i-th element of each vector.
Amazon_Computer_Best_Sellers <- tibble(
  Brand_name = Brand_name,
  Price = Price,
  Rating = Rating
)
Amazon_Computer_Best_Sellers
## # A tibble: 30 x 3
##    Brand_name                                                       Price Rating
##    <chr>                                                            <chr> <chr> 
##  1 Dell OptiPlex Computer Desktop PC                                $257~ 4.1   
##  2 CYBERPOWERPC Gamer Master Gaming PC                              $849~ 4.6   
##  3 Dell Optiplex 9020 Small Form Factor Desktop with Intel Core i7~ $266~ 4.3   
##  4 HP Elite Desktop PC Computer Intel Core i5 3.1-GHz               $139~ 4.0   
##  5 CYBERPOWERPC Gamer Xtreme VR Gaming PC                           $769~ 4.6   
##  6 SkyTech Chronos Mini Gaming Computer PC Desktop - Intel Core-i3~ $599~ 4.4   
##  7 Dell Inspiron 3910 Desktop Computer Tower - 12th Gen Intel Core~ $548~ 4.3   
##  8 HP EliteDesk 800 G1 SFF High Performance Business Desktop Compu~ $134~ 4.3   
##  9 BOMIX 2023 Newest H8 HTV8                                        $205~ 4.4   
## 10 ROG Strix G10 Gaming Desktop PC                                  $1,4~ 3.7   
## # ... with 20 more rows