Presentable Soup

Efficient querying, scraping, and parsing of HTML. Good for snapshot testing too!

This package supports the Gleam Erlang target.

gleam add presentable_soup@2

import gleam/list
import gleam/result
import gleam/string
import presentable_soup as soup

/// A guided tour of the query and scraper API, written as a runnable example.
///
/// Each step builds a query with `soup.element` or `soup.elements`, attaches a
/// scraper with `soup.return`, runs it over an HTML string with `soup.scrape`,
/// and asserts the exact value that comes back.
pub fn main() {
  // You've got some HTML. Maybe this is downloaded from a website, or it's
  // generated in your tests. Anything is fine.
  let document =
    "
<!doctype html>
<head>
  <title>Presentable Soup Webpage</title>
</head>
<body>
  <h1 id=\"title\">Presentable Soup</h1>
  <p>Is it good? Yes I think it might be!</p>
  <aside>
    <p>Low memory use even for large documents.</p>
  </aside>
</body>
</html>
"

  // Use `element` to start a query for the first element matching all the
  // given matchers, and `scrape` to run it on some HTML.
  let scraped =
    soup.element([soup.with_tag("h1"), soup.with_id("title")])
    |> soup.return(soup.text_content())
    |> soup.scrape(document)
  assert scraped == Ok(["Presentable Soup"])

  // Different scrapers can be used with `return` to extract different data
  // from the queried element.
  let scraped =
    soup.element([soup.with_tag("h1")])
    |> soup.return(soup.attributes())
    |> soup.scrape(document)
  assert scraped == Ok([#("id", "title")])

  // Use `elements` to scrape multiple matching elements.
  let scraped =
    soup.elements([soup.with_tag("p")])
    |> soup.return(soup.text_content())
    |> soup.scrape(document)
  assert scraped
    == Ok([
      ["Is it good? Yes I think it might be!"],
      ["Low memory use even for large documents."],
    ])

  // The `descendant` function can be used to make a more complex query that
  // matches elements within some other element.
  // This query matches any `p` element that is within an `aside` element.
  let scraped =
    soup.element([soup.with_tag("aside")])
    |> soup.descendant([soup.with_tag("p")])
    |> soup.return(soup.text_content())
    |> soup.scrape(document)
  assert scraped == Ok(["Low memory use even for large documents."])

  // Often we need to extract multiple things from one element.
  // To do this we can combine multiple scrapers into one:
  let id_and_text = {
    use attrs, text <- soup.merge2(soup.attributes(), soup.text_content())
    let id = list.key_find(attrs, "id") |> result.unwrap("<no id>")
    // The id is rendered CSS-selector style, with a leading `#`.
    "#" <> id <> ": " <> string.join(text, "\n")
  }
  let scraped =
    soup.element([soup.with_tag("h1")])
    |> soup.return(id_and_text)
    |> soup.scrape(document)
  // The expected value includes the `#` prefix added by `id_and_text`.
  assert scraped == Ok("#title: Presentable Soup")

  // More complex scrapers can be combined to get data from multiple
  // elements within a query.
  let document =
    "
<div class='pokemon' data-type='grass'>
  <title>Bulbasaur</title>
  A chill leafy guy.
</div>
<div class='pokemon' data-type='fire'>
  <title>Charmander</title>
  Creates steam when it rains.
</div>
<div class='pokemon' data-type='water'>
  <title>Squirtle</title>
  Looks rad in sunglasses.
</div>
"
  let pokemon_name =
    soup.element([soup.with_tag("title")])
    |> soup.return(soup.text_content())
    |> soup.map(string.concat)
  // The attribute is written `data-type` in the markup, so that full name is
  // the key to look up; a bare `type` key would never be found.
  let pokemon_type =
    soup.attributes()
    |> soup.try_map(list.key_find(_, "data-type"))
  let pokemon = {
    use name, type_ <- soup.merge2(pokemon_name, pokemon_type)
    Pokemon(name:, type_:)
  }
  let scraped =
    soup.elements([soup.with_class("pokemon")])
    |> soup.return(pokemon)
    |> soup.scrape(document)
  assert scraped
    == Ok([
      Pokemon(name: "Bulbasaur", type_: "grass"),
      Pokemon(name: "Charmander", type_: "fire"),
      Pokemon(name: "Squirtle", type_: "water"),
    ])
}

/// One Pokémon record, built by the example scraper from a single
/// `pokemon` element.
pub type Pokemon {
  Pokemon(
    /// The Pokémon's name, e.g. "Bulbasaur".
    name: String,
    /// The Pokémon's type, e.g. "grass". The trailing underscore avoids the
    /// reserved word `type`.
    type_: String,
  )
}

// The returned elements can be rendered as HTML. This is especially useful
// for snapshot testing!
// Don't test your generated HTML by looking for sub-strings. Instead, query
// for the parts of the page that matter for each test and then snapshot them
// with a library like Giacomo Cavalieri's Birdie.
/// Snapshot test for the contact form on the `/contact` page.
///
/// Instead of snapshotting the whole page, this scrapes only the `form`
/// elements with the `contact-form` class, renders them back to HTML, and
/// hands that to Birdie.
pub fn contact_page_test() {
  let webpage = my_app.handle_request("/contact")

  // Query the page. In this test I want to focus on the contact form.
  let assert Ok(found) =
    soup.elements([soup.with_tag("form"), soup.with_class("contact-form")])
    |> soup.return(soup.element_tree())
    |> soup.scrape(webpage)

  // Render the matched HTML, create a descriptive snapshot string, and
  // snapshot it!
  let snapshot =
    "Contact page `form` with class `contact-form`\n\n"
    <> soup.elements_to_string(found)
  // Birdie takes the content first and the title second. Labelling the title
  // keeps the two strings from being swapped.
  birdie.snap(snapshot, title: "contact page form")
}

Further documentation can be found at https://hexdocs.pm/presentable_soup.

Thanks

A huge thank you to Zachary Dean for making htmerl, the excellent streaming HTML parser this package uses.

Name		Name	Last commit message	Last commit date
Latest commit History 21 Commits
.github/workflows		.github/workflows
birdie_snapshots		birdie_snapshots
src		src
test		test
.gitignore		.gitignore
CHANGELOG.md		CHANGELOG.md
README.md		README.md
gleam.toml		gleam.toml
manifest.toml		manifest.toml

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

Presentable Soup

Thanks

About

Uh oh!

Releases 2

Contributors

Uh oh!

Languages

Folders and files

Latest commit

History

Repository files navigation

Presentable Soup

Thanks

About

Topics

Resources

Uh oh!

Stars

Watchers

Forks

Releases 2

Contributors

Uh oh!

Languages