Skip to content

Commit 526cfcb

Browse files
authored
Web crate (#714)
* issue-620: user agent is required
* showing that crate_id is the crate in the params
* Introduce rust files as tested modules
* web crate uses modules
* removed extra file
* broken module checks links with threads with easy interface
* test the cookbook
* scraping complete
1 parent 57442b2 commit 526cfcb

File tree

11 files changed

+321
-189
lines changed

11 files changed

+321
-189
lines changed

crates/web/Cargo.toml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
[package]
2+
name = "web"
3+
version = "0.1.0"
4+
edition = "2024"
5+
6+
[dependencies]
7+
anyhow = "1.0.94"
8+
mime = "0.3.17"
9+
regex = "1.11.1"
10+
reqwest = { version = "0.12.9", features = ["blocking", "json", "stream"] }
11+
select = "0.6.0"
12+
serde = { version = "1.0.215", features = ["derive"] }
13+
tempfile = "3.14.0"
14+
thiserror = "2.0.5"
15+
tokio = { version = "1.42.0", features = ["full"] }
16+
url = "2.5.4"

crates/web/README.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
To see the test output, run:
2+
3+
```bash
4+
cargo test -- --nocapture
5+
```
6+
7+
To test only the wiki module, run:
8+
9+
```bash
10+
cargo test wiki -- --nocapture
11+
```

crates/web/src/broken.rs

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
use thiserror::Error;
2+
use reqwest::StatusCode;
3+
use select::document::Document;
4+
use select::predicate::Name;
5+
use std::collections::HashSet;
6+
use url::{Position, Url};
7+
8+
#[derive(Error, Debug)]
9+
pub enum BrokenError {
10+
#[error("Reqwest error: {0}")]
11+
ReqError(#[from] reqwest::Error),
12+
#[error("IO error: {0}")]
13+
IoError(#[from] std::io::Error),
14+
#[error("URL parse error: {0}")]
15+
UrlParseError(#[from] url::ParseError),
16+
#[error("Join error: {0}")]
17+
JoinError(#[from] tokio::task::JoinError),
18+
}
19+
20+
/// Outcome of a broken-link check: every distinct link found on the page,
/// sorted into one of two buckets.
///
/// The standard derives let callers print, compare, clone and
/// default-construct results.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CategorizedUrls {
    /// Links whose check reported them as reachable.
    pub ok: Vec<String>,
    /// Links reported as not found, or whose check failed with an error.
    pub broken: Vec<String>,
}
24+
25+
enum Link {
26+
GoodLink(Url),
27+
BadLink(Url),
28+
}
29+
30+
/// Determines the URL against which the page's links are resolved.
///
/// If the document has a `<base>` element with an `href`, the first such
/// `href` is used. It is resolved against `url`, so a relative value such as
/// `/docs/` is accepted rather than failing to parse; an absolute value is
/// returned as-is. Otherwise the base is `url` truncated just before its path
/// (scheme, host and port only).
///
/// # Errors
///
/// Returns `BrokenError::UrlParseError` if the chosen base cannot be parsed.
async fn get_base_url(url: &Url, doc: &Document) -> Result<Url, BrokenError> {
    let base_url = match doc.find(Name("base")).find_map(|n| n.attr("href")) {
        // `join` handles both absolute and relative `href` values.
        Some(href) => url.join(href)?,
        None => Url::parse(&url[..Position::BeforePath])?,
    };
    Ok(base_url)
}
36+
37+
async fn check_link(url: &Url) -> Result<bool, BrokenError> {
38+
let res = reqwest::get(url.as_ref()).await?;
39+
Ok(res.status() != StatusCode::NOT_FOUND)
40+
}
41+
42+
pub async fn check(site: &str) -> Result<CategorizedUrls, BrokenError> {
43+
let url = Url::parse(site)?;
44+
let res = reqwest::get(url.as_ref()).await?.text().await?;
45+
let document = Document::from(res.as_str());
46+
let base_url = get_base_url(&url, &document).await?;
47+
let base_parser = Url::options().base_url(Some(&base_url));
48+
let links: HashSet<Url> = document
49+
.find(Name("a"))
50+
.filter_map(|n| n.attr("href"))
51+
.filter_map(|link| base_parser.parse(link).ok())
52+
.collect();
53+
let mut tasks = vec![];
54+
let mut ok = vec![];
55+
let mut broken = vec![];
56+
57+
for link in links {
58+
tasks.push(tokio::spawn(async move {
59+
if check_link(&link).await.unwrap_or(false) {
60+
Link::GoodLink(link)
61+
} else {
62+
Link::BadLink(link)
63+
}
64+
}));
65+
}
66+
67+
for task in tasks {
68+
match task.await? {
69+
Link::GoodLink(link) => ok.push(link.to_string()),
70+
Link::BadLink(link) => broken.push(link.to_string()),
71+
}
72+
}
73+
74+
Ok(CategorizedUrls { ok, broken })
75+
}
76+

crates/web/src/lib.rs

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
mod broken;
2+
mod paginated;
3+
mod links;
4+
mod wiki;
5+
6+
// Smoke tests that call the live services and print what they receive.
// They need network access; run with `-- --nocapture` to see the output.
#[cfg(test)]
mod tests {
    use super::*;

    /// Prints the first five reverse dependencies reported for `serde`.
    #[test]
    fn test_reverse_dependencies() -> reqwest::Result<()> {
        let first_five = paginated::ReverseDependencies::of("serde")?.take(5);
        for entry in first_five {
            let dependency = entry?;
            println!("{} depends on {}", dependency.id, dependency.crate_id);
        }
        Ok(())
    }

    /// Prints every link found on the cookbook landing page.
    #[tokio::test]
    async fn test_links() -> Result<(), links::LinkError> {
        let cookbook = "https://rust-lang-nursery.github.io/rust-cookbook/";
        for link in links::get_links(cookbook).await? {
            println!("{}", link);
        }
        Ok(())
    }

    /// Prints the ok/broken classification of the links on the scraping page.
    #[tokio::test]
    async fn test_broken() -> Result<(), broken::BrokenError> {
        let scraping_page = "https://rust-lang-nursery.github.io/rust-cookbook/web/scraping.html";
        let categorized = broken::check(scraping_page).await?;
        println!("OK: {:?}", categorized.ok);
        println!("Broken: {:?}", categorized.broken);
        Ok(())
    }

    /// Downloads the raw wiki markup of an article and prints its links.
    #[tokio::test]
    async fn test_wiki() -> anyhow::Result<()> {
        let raw_article =
            "https://en.wikipedia.org/w/index.php?title=Rust_(programming_language)&action=raw";
        let response = reqwest::get(raw_article).await?;
        let content = response.text().await?;

        println!("{:#?}", wiki::extract_links(content.as_str()));

        Ok(())
    }
}

crates/web/src/links.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
use thiserror::Error;
2+
use select::document::Document;
3+
use select::predicate::Name;
4+
5+
#[derive(Error, Debug)]
6+
pub enum LinkError {
7+
#[error("Reqwest error: {0}")]
8+
ReqError(#[from] reqwest::Error),
9+
#[error("IO error: {0}")]
10+
IoError(#[from] std::io::Error),
11+
}
12+
13+
pub async fn get_links(page: &str) -> Result<Vec<Box<str>>, LinkError> {
14+
let res = reqwest::get(page)
15+
.await?
16+
.text()
17+
.await?;
18+
19+
let links = Document::from(res.as_str())
20+
.find(Name("a"))
21+
.filter_map(|node| node.attr("href"))
22+
.into_iter()
23+
.map(|link| Box::<str>::from(link.to_string()))
24+
.collect();
25+
26+
Ok(links)
27+
}
28+

crates/web/src/paginated.rs

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
use reqwest::Result;
2+
use reqwest::header::USER_AGENT;
3+
use serde::Deserialize;
4+
5+
#[derive(Deserialize)]
6+
struct ApiResponse {
7+
dependencies: Vec<Dependency>,
8+
meta: Meta,
9+
}
10+
11+
#[derive(Deserialize)]
12+
pub struct Dependency {
13+
pub crate_id: String,
14+
pub id: u32,
15+
}
16+
17+
#[derive(Deserialize)]
18+
struct Meta {
19+
total: u32,
20+
}
21+
22+
pub struct ReverseDependencies {
23+
crate_id: String,
24+
dependencies: <Vec<Dependency> as IntoIterator>::IntoIter,
25+
client: reqwest::blocking::Client,
26+
page: u32,
27+
per_page: u32,
28+
total: u32,
29+
}
30+
31+
impl ReverseDependencies {
32+
pub fn of(crate_id: &str) -> Result<Self> {
33+
Ok(ReverseDependencies {
34+
crate_id: crate_id.to_owned(),
35+
dependencies: vec![].into_iter(),
36+
client: reqwest::blocking::Client::new(),
37+
page: 0,
38+
per_page: 100,
39+
total: 0,
40+
})
41+
}
42+
43+
fn try_next(&mut self) -> Result<Option<Dependency>> {
44+
if let Some(dep) = self.dependencies.next() {
45+
return Ok(Some(dep));
46+
}
47+
48+
if self.page > 0 && self.page * self.per_page >= self.total {
49+
return Ok(None);
50+
}
51+
52+
self.page += 1;
53+
let url = format!("https://crates.io/api/v1/crates/{}/reverse_dependencies?page={}&per_page={}",
54+
self.crate_id,
55+
self.page,
56+
self.per_page);
57+
println!("{}", url);
58+
59+
let response = self.client.get(&url).header(
60+
USER_AGENT,
61+
"cookbook agent",
62+
).send()?.json::<ApiResponse>()?;
63+
self.dependencies = response.dependencies.into_iter();
64+
self.total = response.meta.total;
65+
Ok(self.dependencies.next())
66+
}
67+
}
68+
69+
impl Iterator for ReverseDependencies {
70+
type Item = Result<Dependency>;
71+
72+
fn next(&mut self) -> Option<Self::Item> {
73+
match self.try_next() {
74+
Ok(Some(dep)) => Some(Ok(dep)),
75+
Ok(None) => None,
76+
Err(err) => Some(Err(err)),
77+
}
78+
}
79+
}

crates/web/src/wiki.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
use regex::Regex;
2+
use std::borrow::Cow;
3+
use std::collections::HashSet;
4+
use std::sync::LazyLock;
5+
6+
pub fn extract_links(content: &str) -> HashSet<Cow<str>> {
7+
static WIKI_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(
8+
r"(?x)
9+
\[\[(?P<internal>[^\[\]|]*)[^\[\]]*\]\] # internal links
10+
|
11+
(url=|URL\||\[)(?P<external>http.*?)[ \|}] # external links
12+
"
13+
)
14+
.unwrap()
15+
);
16+
17+
let links: HashSet<_> = WIKI_REGEX
18+
.captures_iter(content)
19+
.map(|c| match (c.name("internal"), c.name("external")) {
20+
(Some(val), None) => Cow::from(val.as_str()),
21+
(None, Some(val)) => Cow::from(val.as_str()),
22+
_ => unreachable!(),
23+
})
24+
.collect::<HashSet<_>>();
25+
26+
links
27+
}

src/web/clients/api/paginated.md

Lines changed: 7 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -3,87 +3,17 @@
33
[![reqwest-badge]][reqwest] [![serde-badge]][serde] [![cat-net-badge]][cat-net] [![cat-encoding-badge]][cat-encoding]
44

55
Wraps a paginated web API in a convenient Rust iterator. The iterator lazily
6-
fetches the next page of results from the remote server as it arrives at the end
7-
of each page.
6+
fetches the next page of results from the remote server as it arrives at the end of each page.
87

9-
```rust,edition2018,no_run
10-
use reqwest::Result;
11-
use serde::Deserialize;
12-
13-
#[derive(Deserialize)]
14-
struct ApiResponse {
15-
dependencies: Vec<Dependency>,
16-
meta: Meta,
17-
}
18-
19-
#[derive(Deserialize)]
20-
struct Dependency {
21-
crate_id: String,
22-
}
23-
24-
#[derive(Deserialize)]
25-
struct Meta {
26-
total: u32,
27-
}
28-
29-
struct ReverseDependencies {
30-
crate_id: String,
31-
dependencies: <Vec<Dependency> as IntoIterator>::IntoIter,
32-
client: reqwest::blocking::Client,
33-
page: u32,
34-
per_page: u32,
35-
total: u32,
36-
}
37-
38-
impl ReverseDependencies {
39-
fn of(crate_id: &str) -> Result<Self> {
40-
Ok(ReverseDependencies {
41-
crate_id: crate_id.to_owned(),
42-
dependencies: vec![].into_iter(),
43-
client: reqwest::blocking::Client::new(),
44-
page: 0,
45-
per_page: 100,
46-
total: 0,
47-
})
48-
}
49-
50-
fn try_next(&mut self) -> Result<Option<Dependency>> {
51-
if let Some(dep) = self.dependencies.next() {
52-
return Ok(Some(dep));
53-
}
54-
55-
if self.page > 0 && self.page * self.per_page >= self.total {
56-
return Ok(None);
57-
}
58-
59-
self.page += 1;
60-
let url = format!("https://crates.io/api/v1/crates/{}/reverse_dependencies?page={}&per_page={}",
61-
self.crate_id,
62-
self.page,
63-
self.per_page);
64-
65-
let response = self.client.get(&url).send()?.json::<ApiResponse>()?;
66-
self.dependencies = response.dependencies.into_iter();
67-
self.total = response.meta.total;
68-
Ok(self.dependencies.next())
69-
}
70-
}
71-
72-
impl Iterator for ReverseDependencies {
73-
type Item = Result<Dependency>;
74-
75-
fn next(&mut self) -> Option<Self::Item> {
76-
match self.try_next() {
77-
Ok(Some(dep)) => Some(Ok(dep)),
78-
Ok(None) => None,
79-
Err(err) => Some(Err(err)),
80-
}
81-
}
8+
```rust,edition2024,no_run
9+
mod paginated {
10+
{{#include ../../../../crates/web/src/paginated.rs}}
8211
}
8312
8413
fn main() -> Result<()> {
85-
for dep in ReverseDependencies::of("serde")? {
86-
println!("reverse dependency: {}", dep?.crate_id);
14+
for dep in paginated::ReverseDependencies::of("serde")? {
15+
let dependency = dep?;
16+
println!("{} depends on {}", dependency.id, dependency.crate_id);
8717
}
8818
Ok(())
8919
}

0 commit comments

Comments
 (0)