Marrut,
Primeiramente, eu não gosto de usar pacotes que tentam automatizar a criação de scrapers como o {ralger}
. Normalmente eles acabam criando problemas como o que você está descrevendo, juntando coisas em colunas e linhas que não fazem sentido quando pensamos nos dados. A minha recomendação é usar pacotes mais “básicos”, mas que dão controle total ao usuário; eu particularmente recomendo o {xml2}
e o {httr}
.
Normalmente nós não resolvemos o problema do aluno por completo, porque não queremos dar o peixe pronto. Neste caso, entretanto, seria muita sacanagem falar para você usar outro pacote e deixar por isso mesmo… No final, achei a raspagem desse site muito legal para não fazer um scraper por conta própria Abaixo fica a minha solução para raspar um evento do World Surf League:
library(magrittr)
# Parse one athlete node
# @param athlete Node containing athlete information
# @return A data frame with 1 row and 9 columns
athlete_parse <- function(athlete) {
# Athlete place
athlete_place <- athlete %>%
xml2::xml_find_first(".//span[@class='new-heat-athlete-place']") %>%
xml2::xml_text() %>%
base::as.integer()
# Athlete image
athlete_image <- athlete %>%
xml2::xml_find_first(".//div[@class='avatar-image']/a") %>%
xml2::xml_attr("href")
# Athlete name
athlete_name <- athlete %>%
xml2::xml_find_first(".//span[@class='athlete-name']") %>%
xml2::xml_text()
# Athlete country
athlete_country <- athlete %>%
xml2::xml_find_first(".//span[@class='athlete-country-flag']") %>%
xml2::xml_attr("title")
# Helper function to handle missing nodes
safe_xml_text <- purrr::possibly(xml2::xml_text, NA)
# Athlete first wave
athlete_wave1 <- athlete %>%
xml2::xml_find_first(".//span[contains(@class, 'wave-1')]") %>%
xml2::xml_children() %>%
purrr::pluck(2) %>%
safe_xml_text() %>%
base::as.numeric()
# Athlete second wave
athlete_wave2 <- athlete %>%
xml2::xml_find_first(".//span[contains(@class, 'wave-2')]") %>%
xml2::xml_children() %>%
purrr::pluck(2) %>%
safe_xml_text() %>%
base::as.numeric()
# Athlete total score
athlete_score <- athlete %>%
xml2::xml_find_first(".//span[contains(@class, 'wave-total')]") %>%
xml2::xml_children() %>%
purrr::map(xml2::xml_text) %>%
utils::tail(-1) %>%
purrr::set_names("total", "needs") %>%
purrr::map(stringr::str_extract, "[0-9.]+$") %>%
purrr::map(as.numeric)
# Athlete waves (all)
athlete_all_waves <- athlete %>%
xml2::xml_find_first(".//div[contains(@class, 'all-waves')]") %>%
xml2::xml_children() %>%
purrr::map(~ list(
number = xml2::xml_find_first(.x, ".//span[@class='wave-number']"),
score = xml2::xml_find_first(.x, ".//span[@class='score']")
)) %>%
purrr::map(purrr::map, xml2::xml_text) %>%
purrr::transpose() %>%
dplyr::as_tibble() %>%
tidyr::unnest(dplyr::everything()) %>%
dplyr::transmute(
wave_number = as.integer(number),
wave_score = stringr::str_squish(score),
wave_score = stringr::str_extract(wave_score, "^[0-9.]+$"),
wave_score = as.numeric(wave_score)
)
# Athlete data frame
dplyr::tibble(
athlete_place = athlete_place,
athlete_name = athlete_name,
athlete_country = athlete_country,
athlete_image = athlete_image,
athlete_wave1 = athlete_wave1,
athlete_wave2 = athlete_wave2,
athlete_total = athlete_score$total,
athlete_needs = athlete_score$needs,
athlete_waves = list(athlete_all_waves)
)
}
# Parse one heat node
# @param heat Node containing heat information
# @return A data frame with 1 row and 4 columns
heat_parse <- function(heat) {
# Heat name
heat_name <- heat %>%
xml2::xml_find_first(".//span[@class='new-heat-hd-name']") %>%
xml2::xml_contents() %>%
purrr::pluck(1) %>%
xml2::xml_text() %>%
stringr::str_squish()
# Heat average wave score
heat_aws <- heat %>%
xml2::xml_find_first(".//span[@class='new-heat-hd-status']") %>%
xml2::xml_contents() %>%
purrr::pluck(1) %>%
xml2::xml_text() %>%
stringr::str_squish() %>%
base::as.numeric()
# Parse all athletes
athletes <- heat %>%
xml2::xml_find_all(".//div[@class='bd new-heat-bd']/div") %>%
purrr::map_dfr(athlete_parse)
# Heat data frame
dplyr::tibble(
heat_number = as.numeric(stringr::str_extract(heat_name, "[0-9]+$")),
heat_name = heat_name,
heat_aws = heat_aws,
heat_athletes = list(athletes)
)
}
# Parse one round node
# @param round Node containing round information
# @return A data frame with 1 row and 3 columns
round_parse <- function(round) {
# Round number
round_number <- round %>%
xml2::xml_find_all("//li[contains(@class, 'is-selected')]") %>%
xml2::xml_attr("data-item-index") %>%
base::as.integer() %>%
magrittr::add(1L)
# Round name
round_name <- round %>%
xml2::xml_find_all("//span[@class='carousel-item-title']") %>%
purrr::pluck(round_number) %>%
xml2::xml_text()
# Parse all heats
heats <- round %>%
xml2::xml_find_all("//div[contains(@id, 'heat-')]") %>%
purrr::map_dfr(heat_parse)
# Round data frame
dplyr::tibble(
round_number = round_number,
round_name = round_name,
round_heats = list(heats)
)
}
# Parse one event
# @param event String with URL of event
# @return A data frame with 1 row and 4 columns
event_parse <- function(event) {
# Download event HTML
event <- xml2::read_html(httr::GET(event))
# Event information
event_info <- event %>%
xml2::xml_find_first("//span[@class='event-meta-tour-info']") %>%
xml2::xml_contents() %>%
xml2::xml_text() %>%
magrittr::extract(. != "") %>%
stringr::str_c(collapse = "|") %>%
stringr::str_squish()
# Event name
event_name <- event %>%
xml2::xml_find_first("//div[@class='event-title']/h1") %>%
xml2::xml_text()
# Event schedule
event_schedule <- event %>%
xml2::xml_find_first("//div[@class='event-schedule']") %>%
xml2::xml_text() %>%
stringr::str_squish()
# Base URL for rounds
url <- "https://www.worldsurfleague.com"
# Parse all rounds
rounds <- event %>%
xml2::xml_find_all("//li[contains(@class, 'carousel-item')]") %>%
xml2::xml_attr("data-item-href") %>%
stringr::str_c(url, .) %>%
purrr::map(httr::GET) %>%
purrr::map(xml2::read_html) %>%
purrr::map_dfr(round_parse)
# Event data frame
dplyr::tibble(
event_name = event_name,
event_info = event_info,
event_schedule = event_schedule,
event_rounds = list(rounds)
)
}
# URL to event
event <- stringr::str_c(
"https://www.worldsurfleague.com/",
"events/2021/mct/3616/",
"billabong-pipe-masters-presented-by-hydro-flask"
)
# Parse event and unnest each level
event %>%
event_parse() %>%
tidyr::unnest(event_rounds) %>%
tidyr::unnest(round_heats) %>%
tidyr::unnest(heat_athletes) %>%
tidyr::unnest(athlete_waves) %>%
dplyr::glimpse(width = 70)
#> Rows: 1,650
#> Columns: 18
#> $ event_name <chr> "Billabong Pipe Masters presented by Hydro …
#> $ event_info <chr> "Men's CT #1 | Banzai Pipeline, Oahu, Hawai…
#> $ event_schedule <chr> "Dec 8 - 20, 2020", "Dec 8 - 20, 2020", "De…
#> $ round_number <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
#> $ round_name <chr> "Seeding Round", "Seeding Round", "Seeding …
#> $ heat_number <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
#> $ heat_name <chr> "Heat 1", "Heat 1", "Heat 1", "Heat 1", "He…
#> $ heat_aws <dbl> 2.71, 2.71, 2.71, 2.71, 2.71, 2.71, 2.71, 2…
#> $ athlete_place <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
#> $ athlete_name <chr> "Kanoa Igarashi", "Kanoa Igarashi", "Kanoa …
#> $ athlete_country <chr> "Japan", "Japan", "Japan", "Japan", "Japan"…
#> $ athlete_image <chr> "https://www.worldsurfleague.com/athletes/3…
#> $ athlete_wave1 <dbl> 5.43, 5.43, 5.43, 5.43, 5.43, 5.43, 5.43, 5…
#> $ athlete_wave2 <dbl> 4.87, 4.87, 4.87, 4.87, 4.87, 4.87, 4.87, 4…
#> $ athlete_total <dbl> 10.30, 10.30, 10.30, 10.30, 10.30, 10.30, 1…
#> $ athlete_needs <dbl> 5.26, 5.26, 5.26, 5.26, 5.26, 5.26, 5.26, 5…
#> $ wave_number <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, …
#> $ wave_score <dbl> 0.40, 2.67, 5.43, 4.87, 0.53, NA, NA, NA, N…
Created on 2021-02-04 by the reprex package (v1.0.0)
Espero que o código acima atenda às suas necessidades. Caso contrário, acho que até está fácil adaptá-lo (apesar de não ser meu scraper mais organizado). Note que eu uso a função tidyr::unnest()
algumas vezes no final! Ela é essencial para raspadores como esse, ou seja, nos quais precisamos iterar em muitas listas com itens do mesmo tipo.