scrape image links as well (they do not need auth)
This commit is contained in:
parent
7649026b91
commit
79250a53d3
@ -34,6 +34,7 @@ pub struct Footprint {
|
|||||||
pub url: String,
|
pub url: String,
|
||||||
pub date: Date,
|
pub date: Date,
|
||||||
pub page: u8,
|
pub page: u8,
|
||||||
|
pub images: Vec<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
const ISO8601_DATE: u128 = iso8601::Config::DEFAULT
|
const ISO8601_DATE: u128 = iso8601::Config::DEFAULT
|
||||||
@ -54,11 +55,23 @@ impl Footprint {
|
|||||||
page_url(root_url, self.page),
|
page_url(root_url, self.page),
|
||||||
self.page
|
self.page
|
||||||
);
|
);
|
||||||
|
// FIXME: HTML injection again - the image URLs below are interpolated into the markup without escaping
|
||||||
|
let content = self
|
||||||
|
.images
|
||||||
|
.iter()
|
||||||
|
.map(|x| format!("<img src=\"{}\" /><br />", x))
|
||||||
|
.fold(String::new(), |mut a, b| {
|
||||||
|
a.reserve(b.len());
|
||||||
|
a.push_str(&b);
|
||||||
|
a
|
||||||
|
});
|
||||||
|
|
||||||
ItemBuilder::default()
|
ItemBuilder::default()
|
||||||
.title(Some(self.title))
|
.title(Some(self.title))
|
||||||
.pub_date(self.date.format(&Iso8601::<ISO8601_DATE>).ok())
|
.pub_date(self.date.format(&Iso8601::<ISO8601_DATE>).ok())
|
||||||
.link(Some(self.url.clone()))
|
.link(Some(self.url.clone()))
|
||||||
.description(Some(desc))
|
.description(Some(desc))
|
||||||
|
.content(Some(content))
|
||||||
.guid(Some(GuidBuilder::default().value(self.url).build()))
|
.guid(Some(GuidBuilder::default().value(self.url).build()))
|
||||||
.build()
|
.build()
|
||||||
}
|
}
|
||||||
|
@ -28,8 +28,6 @@ pub async fn fetch_footprints(feed_url: &str) -> Result<Vec<Footprint>, ScrapeEr
|
|||||||
let footprint_title_selector = Selector::parse("div.title > h2.headline > a")?;
|
let footprint_title_selector = Selector::parse("div.title > h2.headline > a")?;
|
||||||
let footprint_date_selector = Selector::parse("div.title > span.date > span.desc")?;
|
let footprint_date_selector = Selector::parse("div.title > span.date > span.desc")?;
|
||||||
let more_selector = Selector::parse("a#footprintListLoadMore")?;
|
let more_selector = Selector::parse("a#footprintListLoadMore")?;
|
||||||
let text_selector = Selector::parse("div.text > p")?;
|
|
||||||
let text_rest_selector = Selector::parse("div.text > p > span.rest")?;
|
|
||||||
|
|
||||||
let mut footprints: Vec<Footprint> = Vec::new();
|
let mut footprints: Vec<Footprint> = Vec::new();
|
||||||
let mut has_more_pages = true;
|
let mut has_more_pages = true;
|
||||||
@ -44,8 +42,6 @@ pub async fn fetch_footprints(feed_url: &str) -> Result<Vec<Footprint>, ScrapeEr
|
|||||||
&x,
|
&x,
|
||||||
&footprint_title_selector,
|
&footprint_title_selector,
|
||||||
&footprint_date_selector,
|
&footprint_date_selector,
|
||||||
&text_selector,
|
|
||||||
&text_rest_selector,
|
|
||||||
page,
|
page,
|
||||||
)
|
)
|
||||||
}));
|
}));
|
||||||
@ -61,26 +57,20 @@ fn parse_footprint(
|
|||||||
footprint_el: &ElementRef,
|
footprint_el: &ElementRef,
|
||||||
footprint_title_selector: &Selector,
|
footprint_title_selector: &Selector,
|
||||||
footprint_date_selector: &Selector,
|
footprint_date_selector: &Selector,
|
||||||
text_selector: &Selector,
|
|
||||||
text_rest_selector: &Selector,
|
|
||||||
page: u8,
|
page: u8,
|
||||||
) -> Option<Footprint> {
|
) -> Option<Footprint> {
|
||||||
let title_el = footprint_el.select(footprint_title_selector).next()?;
|
let title_el = footprint_el.select(footprint_title_selector).next()?;
|
||||||
let title = title_el.text().next()?.to_string();
|
let title = single_text_from_el(&title_el)?;
|
||||||
let url = title_el.value().attr("href")?.to_string();
|
let url = attr_from_el(&title_el, "href")?;
|
||||||
let date = footprint_el
|
let date_el = footprint_el.select(footprint_date_selector).next()?;
|
||||||
.select(footprint_date_selector)
|
let date = attr_from_el(&date_el, "content")?;
|
||||||
.next()?
|
|
||||||
.value()
|
|
||||||
.attr("content")?;
|
|
||||||
let format = format_description::parse("[year]-[month]-[day]").ok()?;
|
let format = format_description::parse("[year]-[month]-[day]").ok()?;
|
||||||
let date = Date::parse(date, &format).ok()?;
|
let date = Date::parse(&date, &format).ok()?;
|
||||||
|
|
||||||
let text = if let Some(text) = footprint_el.select(text_selector).next() {
|
let text = if let Some(text) = single_text_from_selector(footprint_el, "div.text > p") {
|
||||||
let text = text.text().next()?.to_string();
|
if let Some(text_rest) = single_text_from_selector(footprint_el, "div.text > p > span.rest")
|
||||||
|
{
|
||||||
if let Some(text_rest) = footprint_el.select(text_rest_selector).next() {
|
format!("{}{}", text, text_rest)
|
||||||
format!("{}{}", text, text_rest.text().next()?)
|
|
||||||
} else {
|
} else {
|
||||||
text
|
text
|
||||||
}
|
}
|
||||||
@ -90,15 +80,55 @@ fn parse_footprint(
|
|||||||
.trim()
|
.trim()
|
||||||
.to_string();
|
.to_string();
|
||||||
|
|
||||||
|
let images = scrape_img_links(&footprint_el).ok()?;
|
||||||
|
|
||||||
Some(Footprint {
|
Some(Footprint {
|
||||||
title,
|
title,
|
||||||
url,
|
url,
|
||||||
date,
|
date,
|
||||||
text,
|
text,
|
||||||
page,
|
page,
|
||||||
|
images,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn single_text_from_selector(el: &ElementRef, selector: &str) -> Option<String> {
|
||||||
|
let selector = Selector::parse(selector).ok()?;
|
||||||
|
let selected = el.select(&selector).next()?;
|
||||||
|
|
||||||
|
single_text_from_el(&selected)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn single_text_from_el(el: &ElementRef) -> Option<String> {
|
||||||
|
el.text().next().map(|x| x.to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn attr_from_el(el: &ElementRef, attribute: &str) -> Option<String> {
|
||||||
|
Some(el.value().attr(attribute)?.to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn scrape_img_links(footprint_el: &ElementRef) -> Result<Vec<String>, ScrapeError> {
|
||||||
|
let main_img_selector = Selector::parse("div.images-container > a.image")?;
|
||||||
|
let other_img_selector = Selector::parse("div.images-container > div.thumbs > a.image.thumb")?;
|
||||||
|
|
||||||
|
let mut img_urls = Vec::new();
|
||||||
|
|
||||||
|
if let Some(main_img) = footprint_el.select(&main_img_selector).next() {
|
||||||
|
if let Some(main_img_url) = attr_from_el(&main_img, "data-url") {
|
||||||
|
img_urls.push(main_img_url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for img in footprint_el.select(&other_img_selector) {
|
||||||
|
if let Some(main_img_url) = attr_from_el(&img, "data-url") {
|
||||||
|
img_urls.push(main_img_url);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let img_urls = img_urls.iter().map(|x| format!("https:{}", x)).collect();
|
||||||
|
|
||||||
|
Ok(img_urls)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn page_url(feed_url: &str, page: u8) -> String {
|
pub fn page_url(feed_url: &str, page: u8) -> String {
|
||||||
let connector = if feed_url.contains('?') { "&" } else { "?" };
|
let connector = if feed_url.contains('?') { "&" } else { "?" };
|
||||||
format!("{}{}page={}&sort=ASC", feed_url, connector, page)
|
format!("{}{}page={}&sort=ASC", feed_url, connector, page)
|
||||||
|
Loading…
Reference in New Issue
Block a user