Skip to content

lpil/presentable-soup

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

21 Commits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

Presentable Soup

Efficient querying, scraping, and parsing of HTML. Good for snapshot testing too!

Package Version Hex Docs

This package supports the Gleam Erlang target.

gleam add presentable_soup@2
import gleam/list
import gleam/result
import gleam/string
import presentable_soup as soup

pub fn main() {
  // You've got some HTML. Maybe this is downloaded from a website, or it's
  // generated in your tests. Anything is fine.
  let document =
    "
<!doctype html>
<head>
  <title>Presentable Soup Webpage</title>
</head>
<body>
  <h1 id=\"title\">Presentable Soup</h1>
  <p>Is it good? Yes I think it might be!</p>
  <aside>
    <p>Low memory use even for large documents.</p>
  </aside>
</body>
</html>
"

  // Use `element` to start a query for the first element matching all the
  // given matchers, and `scrape` to run it on some HTML.
  let scraped =
    soup.element([soup.with_tag("h1"), soup.with_id("title")])
    |> soup.return(soup.text_content())
    |> soup.scrape(document)
  assert scraped == Ok(["Presentable Soup"])

  // Different scrapers can be use with `return` to extract different data
  // from the queried element.
  let scraped =
    soup.element([soup.with_tag("h1")])
    |> soup.return(soup.attributes())
    |> soup.scrape(document)
  assert scraped == Ok([#("id", "title")])

  // Use `elements` to scrape multiple matching elements.
  let scraped =
    soup.elements([soup.with_tag("p")])
    |> soup.return(soup.text_content())
    |> soup.scrape(document)
  assert scraped
    == Ok([
      ["Is it good? Yes I think it might be!"],
      ["Low memory use even for large documents."],
    ])

  // The `descendant` function can be used to make a more complex query that
  // matches elements within some other element.
  // This query matches any `p` element that is within an `aside` element.
  let scraped =
    soup.element([soup.with_tag("aside")])
    |> soup.descendant([soup.with_tag("p")])
    |> soup.return(soup.text_content())
    |> soup.scrape(document)
  assert scraped == Ok(["Low memory use even for large documents."])

  // Often we need to extract multiple things from one element.
  // To do this we can combine multiple scrapers into one:
  let id_and_text = {
    use attrs, text <- soup.merge2(soup.attributes(), soup.text_content())
    let id = list.key_find(attrs, "id") |> result.unwrap("<no id>")
    "#" <> id <> ": " <> string.join(text, "\n")
  }
  let scraped =
    soup.element([soup.with_tag("h1")])
    |> soup.return(id_and_text)
    |> soup.scrape(document)
  assert scraped == Ok("title: Presentable Soup")

  // More complex scrapers can be combined to get data from multiple
  // elements within a query.
  let document =
    "
<div class='pokemon' data-type='grass'>
  <title>Bulbasaur</title>
  A chill leafy guy.
</div>
<div class='pokemon' data-type='fire'>
  <title>Charmander</title>
  Creates steam when it rains.
</div>
<div class='pokemon' data-type='water'>
  <title>Squirtle</title>
  Looks rad in sunglasses.
</div>
"
  let pokemon_name =
    soup.element([soup.with_tag("title")])
    |> soup.return(soup.text_content())
    |> soup.map(string.concat)
  let pokemon_type =
    soup.attributes()
    |> soup.try_map(list.key_find(_, "type"))
  let pokemon = {
    use name, type_ <- soup.merge2(pokemon_name, pokemon_type)
    Pokemon(name:, type_:)
  }
  let scraped =
    soup.elements([soup.with_class("pokemon")])
    |> soup.return(pokemon)
    |> soup.scrape(document)
  assert scraped
    == Ok([
      Pokemon(name: "Bulbasaur", type_: "grass"),
      Pokemon(name: "Charmander", type_: "fire"),
      Pokemon(name: "Squirtle", type_: "water"),
    ])
}

pub type Pokemon {
  Pokemon(name: String, type_: String)
}

// The returned elements can be rendered as HTML. This is especially useful
// for snapshot testing!
// Don't test your generated HTML by looking for sub-strings, instead query
// for the parts of the page that matter for each test and then snapshot it
// with a library like Giacomo Cavalieri's Birdie.
pub fn contact_page_test() {
  let webpage = my_app.handle_request("/contact")

  // Query the page. In this test I want to focus on the contact form.
  let assert Ok(found) =
    soup.elements([soup.with_tag("form"), soup.with_class("contact-form")])
    |> soup.return(soup.element_tree())
    |> soup.scrape(webpage)

  // Render the matched HTML, create a descriptive snapshot string, and
  // snapshot it!
  let snapshot =
    "Contact page `form` with class `contact-form`\n\n"
    <> soup.elements_to_string(found)
  birdie.snap("contact page form", snapshot)
}

Further documentation can be found at https://hexdocs.pm/presentable_soup.

Thanks

A huge thank you to Zachary Dean for making htmerl, the excellent streaming HTML parser this package uses.

About

Querying, scraping, and parsing of HTML. Good for snapshot testing too!

Topics

Resources

Stars

Watchers

Forks

Contributors