[Webscrapping] - Solar spectrum calculator

Olá pessoal!

Eu precisava fazer o scraping desse site para utilizar na minha tese.
A ideia seria modificar os valores de data e coordenadas e pegar os outputs gerados.

Eu até comecei, mas não tenho muita experiência com web scraping e requisições POST:

# Attach httr so that GET(), POST() and content() are available.
library(httr)

# Address of the solar spectrum calculator page; used for both requests below.
url <- "https://www2.pvlighthouse.com.au/calculators/solar%20spectrum%20calculator/solar%20spectrum%20calculator.aspx"

# Initial request for the page.
page_initial <- GET(url)

# Parsed body of the initial response (not referenced again in this snippet).
content_initial <- httr::content(page_initial)

# Fields submitted with the POST, keyed by what appear to be the page's form
# control names (date/time, coordinates, module, atmosphere, plot and export
# settings -- inferred from the key names; verify against the page's form).
# NOTE(review): decimal values are written with a comma ("-34,2833") --
# confirm which decimal separator the server expects.
# NOTE(review): no hidden state fields (e.g. __VIEWSTATE, __EVENTVALIDATION)
# are included in this list.
data_form <- list(
    "smMain"= "TabContainer1$TabPanel1$UpdatePanel1|TabContainer1$TabPanel1$tbYear",
    "TabContainer1$TabPanel1$tbYear"= "2023",
    "TabContainer1$TabPanel1$tbHour"= "12",
    "TabContainer1$TabPanel1$tbMonth"= "1",
    "TabContainer1$TabPanel1$tbMinute"= "0",
    "TabContainer1$TabPanel1$tbDay"= "19",
    "TabContainer1$TabPanel1$tbSecond"= "0",
    "TabContainer1$TabPanel1$tbLatitude"= "-34,2833",
    "TabContainer1$TabPanel1$tbLongitude"= "150,95",
    "TabContainer1$TabPanel1$tbModuleTiltAngle"= "0",
    "TabContainer1$TabPanel1$tbModuleAzimuthAngle"= "0",
    "TabContainer1$TabPanel1$ddlSpectrumSource"= "AM0",
    "TabContainer1$TabPanel1$ddlAtmosphericTransmissionModel"= "SPCTRAL2 [Bir86]",
    "TabContainer1$TabPanel1$tbAtmosphericPressure"= "1013,25",
    "TabContainer1$TabPanel1$tbTurbidity"= "0,084",
    "TabContainer1$TabPanel1$tbPrecipitableWaterVapour"= "1,4164",
    "TabContainer1$TabPanel1$tbOzone"= "0,3438",
    "TabContainer1$TabPanel1$tbAlbedo"= "0,1",
    "TabContainer1$TabPanel1$cpeInsolation_ClientState"= "false",
    "TabContainer1$TabPanel1$cpeFigure_ClientState"= "false",
    "TabContainer1$TabPanel1$ddlFigureYaxis"= "Spectral irradiance",
    "TabContainer1$TabPanel1$ddlSelectedPlane"= "Module",
    "TabContainer1$TabPanel1$ddlFigureXaxis"= "Wavelength",
    "TabContainer1$TabPanel1$cbPlotExtraterrestrialSpectrum"= "on",
    "TabContainer1$TabPanel1$cbPlotDirect"= "on",
    "TabContainer1$TabPanel1$cbPlotDiffuse"= "on",
    "TabContainer1$TabPanel1$cbPlotGlobal"= "on",
    "TabContainer1$tabOptions$ddlSolarPosition"= "Enter module location and time",
    "TabContainer1$tabOptions$ddlSolarVectorAlgorithm"= "PSA algorithm [Bla01]",
    "TabContainer1$tabOptions$ddlSpectrumManipulation"= "Impose wavelength limits",
    "TabContainer1$tabOptions$tbOptionWavelengthMin"= "280",
    "TabContainer1$tabOptions$tbOptionWavelengthMax"= "4000",
    "TabContainer1$tabOptions$tbOptionWavelengthInterval"= "10",
    "TabContainer1$TabPanel2$ddlOptionsFileType"= "CSV US/UK (comma delimited)",
    "TabContainer1$TabPanel2$tbStoreXData"= "",
    "TabContainer1$TabPanel2$tbStoreYData"= "",
    "tbFeedback"= ""
)

# Send the fields to the same URL as a form-encoded POST body.
search_page <- POST(
  url = url,
  body = data_form,
  encode = "form"
)

# Print the raw (unparsed) bytes of the response body.
search_page$content
#>     [1] 0d 0a 0d 0a 3c 21 44 4f 43 54 59 50 45 20 68 74 6d 6c 3e 0d 0a 0d 0a 3c
#>    [25] 68 74 6d 6c 3e 0d 0a 3c 68 65 61 64 3e 3c 74 69 74 6c 65 3e 0d 0a 09 53
#>    [49] 6f 6c 61 72 20 73 70 65 63 74 72 75 6d 20 63 61 6c 63 75 6c 61 74 6f 72
#>    [73] 0d 0a 3c 2f 74 69 74 6c 65 3e 3c 6d 65 74 61 20 69 64 3d 22 4d 65 74 61
#>    [97] 31 22 20 6e 61 6d 65 3d 22 64 65 73 63 72 69 70 74 69 6f 6e 22 20 63 6f
#>  [ reached getOption("max.print") -- omitted 299085 entries ]

Created on 2023-01-19 with reprex v2.0.2

1 curtida

Bruno,

Na sua requisição ficou faltando o __VIEWSTATE e o __EVENTVALIDATION. Veja se você consegue usar o código abaixo como ponto de partida para a sua extração de dados:

# Calculator page; used for both the initial GET and the form POST
page_url <- "https://www2.pvlighthouse.com.au/calculators/solar%20spectrum%20calculator/solar%20spectrum%20calculator.aspx"

# Initial request, needed to read the hidden state fields of the form
res <- httr::GET(page_url)
httr::stop_for_status(res)

# Parse the HTML once and reuse the document for every hidden field
page <- httr::content(res)

# Return the `value` attribute of the <input> element with the given id.
# Fails early if the field is absent, instead of posting NA to the server.
get_hidden_value <- function(doc, id) {
  value <- doc |>
    xml2::xml_find_first(sprintf("//input[@id='%s']", id)) |>
    xml2::xml_attr("value")
  if (is.na(value)) {
    stop("Hidden form field not found: ", id, call. = FALSE)
  }
  value
}

# Hidden state fields that must be echoed back in the POST
viewstate <- get_hidden_value(page, "__VIEWSTATE")
eventvalidation <- get_hidden_value(page, "__EVENTVALIDATION")

# Form fields sent in the POST (decimal values use "." as separator)
params <- list(
  "smMain" = "TabContainer1$TabPanel1$UpdatePanel1|TabContainer1$TabPanel1$tbDay",
  "TabContainer1_ClientState" = "{\"ActiveTabIndex\":0,\"TabEnabledState\":[true,true,true,true],\"TabWasLoadedOnceState\":[true,false,false,false]}",
  "__EVENTTARGET" = "TabContainer1$TabPanel1$tbDay",
  "__EVENTARGUMENT" = "",
  "__LASTFOCUS" = "",
  "__VIEWSTATE" = viewstate,
  "__VIEWSTATEGENERATOR" = "2AB02EE8",
  "__SCROLLPOSITIONX" = "0",
  "__SCROLLPOSITIONY" = "0",
  "__EVENTVALIDATION" = eventvalidation,
  "TabContainer1$TabPanel1$tbYear" = "2023",
  "TabContainer1$TabPanel1$tbHour" = "12",
  "TabContainer1$TabPanel1$tbMonth" = "1",
  "TabContainer1$TabPanel1$tbMinute" = "0",
  "TabContainer1$TabPanel1$tbDay" = "18",
  "TabContainer1$TabPanel1$tbSecond" = "0",
  "TabContainer1$TabPanel1$tbLatitude" = "-34.2833",
  "TabContainer1$TabPanel1$tbLongitude" = "150.95",
  "TabContainer1$TabPanel1$tbModuleTiltAngle" = "0",
  "TabContainer1$TabPanel1$tbModuleAzimuthAngle" = "0",
  "TabContainer1$TabPanel1$ddlSpectrumSource" = "AM0",
  "TabContainer1$TabPanel1$ddlAtmosphericTransmissionModel" = "SPCTRAL2 [Bir86]",
  "TabContainer1$TabPanel1$tbAtmosphericPressure" = "1013.25",
  "TabContainer1$TabPanel1$tbTurbidity" = "0.084",
  "TabContainer1$TabPanel1$tbPrecipitableWaterVapour" = "1.4164",
  "TabContainer1$TabPanel1$tbOzone" = "0.3438",
  "TabContainer1$TabPanel1$tbAlbedo" = "0.1",
  "TabContainer1$TabPanel1$cpeInsolation_ClientState" = "false",
  "TabContainer1$TabPanel1$cpeFigure_ClientState" = "false",
  "TabContainer1$TabPanel1$ddlFigureYaxis" = "Spectral irradiance",
  "TabContainer1$TabPanel1$ddlSelectedPlane" = "Module",
  "TabContainer1$TabPanel1$ddlFigureXaxis" = "Wavelength",
  "TabContainer1$TabPanel1$cbPlotExtraterrestrialSpectrum" = "on",
  "TabContainer1$TabPanel1$cbPlotDirect" = "on",
  "TabContainer1$TabPanel1$cbPlotDiffuse" = "on",
  "TabContainer1$TabPanel1$cbPlotGlobal" = "on",
  "TabContainer1$tabOptions$ddlSolarPosition" = "Enter module location and time",
  "TabContainer1$tabOptions$ddlSolarVectorAlgorithm" = "PSA algorithm [Bla01]",
  "TabContainer1$tabOptions$ddlSpectrumManipulation" = "Impose wavelength limits",
  "TabContainer1$tabOptions$tbOptionWavelengthMin" = "280",
  "TabContainer1$tabOptions$tbOptionWavelengthMax" = "4000",
  "TabContainer1$tabOptions$tbOptionWavelengthInterval" = "10",
  "TabContainer1$TabPanel2$ddlOptionsFileType" = "CSV US/UK (comma delimited)",
  "TabContainer1$TabPanel2$tbStoreXData" = "",
  "TabContainer1$TabPanel2$tbStoreYData" = "",
  "tbFeedback" = "",
  "__ASYNCPOST" = "false"
)

# Submit the form, stop on an HTTP error, and read every table found inside
# the insolation panel of the returned page
page_url |>
  httr::POST(body = params, encode = "form") |>
  httr::stop_for_status() |>
  httr::content() |>
  xml2::xml_find_all("//div[@id='TabContainer1_TabPanel1_panInsolation']//table") |>
  rvest::html_table()
#> [[1]]
#> # A tibble: 7 × 3
#>   X1               X2               X3              
#>   <chr>            <chr>            <chr>           
#> 1 ""               ""               ""              
#> 2 "Solar position" "Solar position" "Solar position"
#> 3 "Air mass"       "1.03"            <NA>           
#> 4 "Zenith angle"   "13.88"          "°"             
#> 5 "Azimuth angle"  "10.03"          "°"             
#> 6 "Incident angle" "13.88"          "°"             
#> 7 ""               ""               ""              
#> 
#> [[2]]
#> # A tibble: 10 × 6
#>    X1                                              X2    X3    X4    X5    X6   
#>    <chr>                                           <chr> <chr> <chr> <chr> <chr>
#>  1 ""                                              ""    ""    ""    ""    ""   
#>  2 ""                                              "Pow… "Pow… ""    "Pho… "Pho…
#>  3 ""                                              "per… "mod… ""    "per… "mod…
#>  4 ""                                              ""    ""    ""    ""    ""   
#>  5 "Direct"                                        "101… "989… ""    "70.… "68.…
#>  6 "Diffuse"                                       "105… "103… ""    "4.8… "4.7…
#>  7 ""                                              ""    ""    ""    ""    ""   
#>  8 "Global"                                        "112… "109… ""    "75.… "73.…
#>  9 ""                                              ""    ""    ""    ""    ""   
#> 10 "Integrated over the wavelength range 280–4000… "Int… "Int… "Int… "Int… "Int…

Created on 2023-01-19 with reprex v2.0.2

Caraca, muito bom!
Se não for pedir muito, eu queria extrair os dados da seção
<!------------------------------ FIGURE ------------------------->
que vem na resposta também, são os dados que geram o gráfico. São esses dados:

image

Isso serve?

# Calculator page; used for both the initial GET and the form POST
page_url <- "https://www2.pvlighthouse.com.au/calculators/solar%20spectrum%20calculator/solar%20spectrum%20calculator.aspx"

# Initial request, needed to read the hidden state fields of the form
res <- httr::GET(page_url)
httr::stop_for_status(res)

# Parse the HTML once and reuse the document for every hidden field
page <- httr::content(res)

# Return the `value` attribute of the <input> element with the given id.
# Fails early if the field is absent, instead of posting NA to the server.
get_hidden_value <- function(doc, id) {
  value <- doc |>
    xml2::xml_find_first(sprintf("//input[@id='%s']", id)) |>
    xml2::xml_attr("value")
  if (is.na(value)) {
    stop("Hidden form field not found: ", id, call. = FALSE)
  }
  value
}

# Hidden state fields that must be echoed back in the POST
viewstate <- get_hidden_value(page, "__VIEWSTATE")
eventvalidation <- get_hidden_value(page, "__EVENTVALIDATION")

# Form fields sent in the POST (decimal values use "." as separator)
params <- list(
  "smMain" = "TabContainer1$TabPanel1$UpdatePanel1|TabContainer1$TabPanel1$tbDay",
  "TabContainer1_ClientState" = "{\"ActiveTabIndex\":0,\"TabEnabledState\":[true,true,true,true],\"TabWasLoadedOnceState\":[true,false,false,false]}",
  "__EVENTTARGET" = "TabContainer1$TabPanel1$tbDay",
  "__EVENTARGUMENT" = "",
  "__LASTFOCUS" = "",
  "__VIEWSTATE" = viewstate,
  "__VIEWSTATEGENERATOR" = "2AB02EE8",
  "__SCROLLPOSITIONX" = "0",
  "__SCROLLPOSITIONY" = "0",
  "__EVENTVALIDATION" = eventvalidation,
  "TabContainer1$TabPanel1$tbYear" = "2023",
  "TabContainer1$TabPanel1$tbHour" = "12",
  "TabContainer1$TabPanel1$tbMonth" = "1",
  "TabContainer1$TabPanel1$tbMinute" = "0",
  "TabContainer1$TabPanel1$tbDay" = "18",
  "TabContainer1$TabPanel1$tbSecond" = "0",
  "TabContainer1$TabPanel1$tbLatitude" = "-34.2833",
  "TabContainer1$TabPanel1$tbLongitude" = "150.95",
  "TabContainer1$TabPanel1$tbModuleTiltAngle" = "0",
  "TabContainer1$TabPanel1$tbModuleAzimuthAngle" = "0",
  "TabContainer1$TabPanel1$ddlSpectrumSource" = "AM0",
  "TabContainer1$TabPanel1$ddlAtmosphericTransmissionModel" = "SPCTRAL2 [Bir86]",
  "TabContainer1$TabPanel1$tbAtmosphericPressure" = "1013.25",
  "TabContainer1$TabPanel1$tbTurbidity" = "0.084",
  "TabContainer1$TabPanel1$tbPrecipitableWaterVapour" = "1.4164",
  "TabContainer1$TabPanel1$tbOzone" = "0.3438",
  "TabContainer1$TabPanel1$tbAlbedo" = "0.1",
  "TabContainer1$TabPanel1$cpeInsolation_ClientState" = "false",
  "TabContainer1$TabPanel1$cpeFigure_ClientState" = "false",
  "TabContainer1$TabPanel1$ddlFigureYaxis" = "Spectral irradiance",
  "TabContainer1$TabPanel1$ddlSelectedPlane" = "Module",
  "TabContainer1$TabPanel1$ddlFigureXaxis" = "Wavelength",
  "TabContainer1$TabPanel1$cbPlotExtraterrestrialSpectrum" = "on",
  "TabContainer1$TabPanel1$cbPlotDirect" = "on",
  "TabContainer1$TabPanel1$cbPlotDiffuse" = "on",
  "TabContainer1$TabPanel1$cbPlotGlobal" = "on",
  "TabContainer1$tabOptions$ddlSolarPosition" = "Enter module location and time",
  "TabContainer1$tabOptions$ddlSolarVectorAlgorithm" = "PSA algorithm [Bla01]",
  "TabContainer1$tabOptions$ddlSpectrumManipulation" = "Impose wavelength limits",
  "TabContainer1$tabOptions$tbOptionWavelengthMin" = "280",
  "TabContainer1$tabOptions$tbOptionWavelengthMax" = "4000",
  "TabContainer1$tabOptions$tbOptionWavelengthInterval" = "10",
  "TabContainer1$TabPanel2$ddlOptionsFileType" = "CSV US/UK (comma delimited)",
  "TabContainer1$TabPanel2$tbStoreXData" = "",
  "TabContainer1$TabPanel2$tbStoreYData" = "",
  "tbFeedback" = "",
  "__ASYNCPOST" = "false"
)

# Submit the form, stop on an HTTP error, and keep the parsed response
cont <- page_url |>
  httr::POST(body = params, encode = "form") |>
  httr::stop_for_status() |>
  httr::content()

# Extract the data: one row per <area> of the chart's image map.
# xml_attr() is vectorised over the node set, so no per-node loop is needed.
areas <- xml2::xml_find_all(cont, "//map/area")
tibble::tibble(
  coords = xml2::xml_attr(areas, "coords"),
  title = xml2::xml_attr(areas, "title")
)
#> # A tibble: 1,494 × 2
#>    coords                                                          title        
#>    <chr>                                                           <chr>        
#>  1 53,335,391,335,391,374,53,374                                   "Click to co…
#>  2 28,105,64,105,64,346,28,346                                     "Click to co…
#>  3 383,333,383,333,383,341,383,341                                 "WL = 4000.0…
#>  4 382,333,382,333,382,341,382,341,382,333,383,333,383,341,382,341 "WL = 3990.0…
#>  5 381,333,382,333,382,341,381,341,382,333,382,333,382,341,382,341 "WL = 3980.0…
#>  6 380,333,381,333,381,341,380,341,381,333,381,333,381,341,381,341 "WL = 3970.0…
#>  7 379,333,380,333,380,341,379,341,380,333,380,333,380,341,380,341 "WL = 3960.0…
#>  8 379,333,379,333,379,341,379,341,379,333,379,333,379,341,379,341 "WL = 3950.0…
#>  9 378,333,378,333,378,341,378,341,378,333,379,333,379,341,378,341 "WL = 3940.0…
#> 10 377,333,377,333,377,341,377,341,377,333,378,333,378,341,377,341 "WL = 3930.0…
#> # … with 1,484 more rows

# Extract the chart image
# `src` attribute of the chart <img> in the parsed response `cont`
chart_src <- cont |>
  xml2::xml_find_first("//img[@id='TabContainer1_TabPanel1_Chart1']") |>
  xml2::xml_attr("src")

if (is.na(chart_src)) {
  stop("Chart image not found in the response.", call. = FALSE)
}

# Resolve `src` against the page URL (works for root-relative and relative
# paths alike), download the image, stop on an HTTP error, and load it
chart_src |>
  xml2::url_absolute(base = "https://www2.pvlighthouse.com.au/calculators/solar%20spectrum%20calculator/solar%20spectrum%20calculator.aspx") |>
  httr::GET() |>
  httr::stop_for_status() |>
  httr::content() |>
  magick::image_read()

Created on 2023-01-19 with reprex v2.0.2

Nossa, perfeito @clente !
Muito obrigado!!