Skip to contents

get_link_from_html() returns all links present in an HTML page.

get_date_from_html() returns the links present in an HTML page that correspond to dates.

Usage

get_link_from_html(url = NULL, x = NULL)

get_date_from_html(
  url = NULL,
  x = NULL,
  expected = c("month", "day", "year", "any"),
  last = FALSE
)

Arguments

url

character, address of the url.

x

character vector; if url is missing, directly supplies a character vector such as the one that read_url(url) would have returned.

expected

character, the type of dates expected: "month", "day", "year", or "any".

last

logical, should a "last" date be looked for.

Value

character vector of the links found. NULL if nothing has been found.

Examples

x = c(
    "<html>",
    "<head><title>Index of /example/</title></head>",
    "<body>",
    "<h1>Index of /example/</h1><hr><pre>",
    "<a href='../'>../</a>",
    "<a href='parent/2023-11/archive.csv.gz'>archive.csv.gz</a>",
    "<a href='2023-12/archive.csv.gz'>archive.csv.gz</a>",
    "<a href=\"2024-01/\">2024-01/</a>              07-Jan-2024 09:54",
    "<a href=\"parent/child/2024-02/\">2024-02/</a> 07-Feb-2024 09:54",
    "<a href=\"2024-03\">2024-03</a>                07-Mar-2024 10:02",
    "<a href=\"2024-01-01/\">2024-01-01/</a>        02-Jan-2024 09:56",
    "<a href=\"2024-01-02\">2024-01-02</a>          03-jan-2024 11:02",
    "<a href=\"2023/\">2023/</a>                    01-Feb-2023 15:54",
    "<a href=\"2024\">2024</a>                      01-Feb-2024 15:54",
    "<a href=\"last/\">last/</a>                    07-Mar-2024 10:02",
    "</pre><hr></body>",
    "</html>"
)
get_link_from_html(x = x)
#>  [1] "../"                           "parent/2023-11/archive.csv.gz"
#>  [3] "2023-12/archive.csv.gz"        "2024-01/"                     
#>  [5] "parent/child/2024-02/"         "2024-03"                      
#>  [7] "2024-01-01/"                   "2024-01-02"                   
#>  [9] "2023/"                         "2024"                         
#> [11] "last/"                        

get_date_from_html(x = x)
#> [1] "2024-01" "2024-02" "2024-03"
get_date_from_html(x = x, expected = "day")
#> [1] "2024-01-01" "2024-01-02"
get_date_from_html(x = x, expected = "year")
#> [1] "2023" "2024"
get_date_from_html(x = x, expected = "any")
#> [1] "2024-01"    "2024-02"    "2024-03"    "2024-01-01" "2024-01-02"
#> [6] "2023"       "2024"      
get_date_from_html(x = x[-(8:10)])
#> character(0)
get_date_from_html(x = x[-(8:10)], last = TRUE)
#> Warning: Only `last` has been found. Possible mismatch with expected format (month)
#> [1] "last"