aidoku/imports/html.rs
1//! Module for working with HTML.
2//!
3//! It provides a convenient API for extracting data, using HTML5
4//! DOM methods and CSS selectors.
5//!
6//! The backend of this module is [SwiftSoup](https://github.com/scinfu/SwiftSoup).
7use super::{
8 FFIResult, Rid,
9 std::{destroy, read_string_and_destroy},
10};
11use crate::alloc::{String, Vec};
12use core::fmt::Display;
13
14#[link(wasm_import_module = "html")]
15unsafe extern "C" {
16 fn parse(
17 html: *const u8,
18 html_len: usize,
19 base_url: *const u8,
20 base_url_len: usize,
21 ) -> FFIResult;
22 fn parse_fragment(
23 html: *const u8,
24 html_len: usize,
25 base_url: *const u8,
26 base_url_len: usize,
27 ) -> FFIResult;
28 fn escape(text: *const u8, text_len: usize) -> FFIResult;
29 fn unescape(text: *const u8, text_len: usize) -> FFIResult;
30
31 fn kind(rid: Rid) -> FFIResult;
32
33 fn child_nodes(rid: Rid) -> FFIResult;
34 fn has_attr(rid: Rid, attr: *const u8, attr_len: usize) -> bool;
35 fn set_attr(
36 rid: Rid,
37 key: *const u8,
38 key_len: usize,
39 value: *const u8,
40 value_len: usize,
41 ) -> FFIResult;
42 fn remove_attr(rid: Rid, attr: *const u8, attr_len: usize) -> FFIResult;
43
44 fn set_text(rid: Rid, text: *const u8, text_len: usize) -> FFIResult;
45 fn set_html(rid: Rid, html: *const u8, html_len: usize) -> FFIResult;
46 fn prepend(rid: Rid, html: *const u8, html_len: usize) -> FFIResult;
47 fn append(rid: Rid, html: *const u8, html_len: usize) -> FFIResult;
48 fn children(rid: Rid) -> FFIResult;
49 fn base_uri(rid: Rid) -> FFIResult;
50 fn own_text(rid: Rid) -> FFIResult;
51 fn data(rid: Rid) -> FFIResult;
52 fn id(rid: Rid) -> FFIResult;
53 fn tag_name(rid: Rid) -> FFIResult;
54 fn class_name(rid: Rid) -> FFIResult;
55 fn has_class(rid: Rid, class: *const u8, class_len: usize) -> bool;
56 fn add_class(rid: Rid, class: *const u8, class_len: usize) -> FFIResult;
57 fn remove_class(rid: Rid, class: *const u8, class_len: usize) -> FFIResult;
58
59 fn first(rid: Rid) -> FFIResult;
60 fn last(rid: Rid) -> FFIResult;
61 #[allow(clashing_extern_declarations)]
62 #[link_name = "get"]
63 fn html_get(rid: Rid, index: usize) -> FFIResult;
64 fn size(rid: Rid) -> FFIResult;
65
66 fn parent(rid: Rid) -> FFIResult;
67 fn siblings(rid: Rid) -> FFIResult;
68 fn next(rid: Rid) -> FFIResult;
69 fn previous(rid: Rid) -> FFIResult;
70
71 fn attr(rid: Rid, key: *const u8, key_len: usize) -> FFIResult;
72 fn outer_html(rid: Rid) -> FFIResult;
73 fn remove(rid: Rid) -> FFIResult;
74
75 fn select(rid: Rid, query: *const u8, query_len: usize) -> FFIResult;
76 fn select_first(rid: Rid, query: *const u8, query_len: usize) -> FFIResult;
77 fn text(rid: Rid) -> FFIResult;
78 fn untrimmed_text(rid: Rid) -> FFIResult;
79 fn html(rid: Rid) -> FFIResult;
80}
81
82/// Error type for HTML operations.
83#[derive(PartialEq, Eq, Debug, Clone)]
84pub enum HtmlError {
85 InvalidDescriptor,
86 InvalidString,
87 InvalidHtml,
88 InvalidQuery,
89 NoResult,
90 SwiftSoupError,
91}
92
93impl HtmlError {
94 fn from(value: FFIResult) -> Option<Self> {
95 match value {
96 -1 => Some(Self::InvalidDescriptor),
97 -2 => Some(Self::InvalidString),
98 -3 => Some(Self::InvalidHtml),
99 -4 => Some(Self::InvalidQuery),
100 -5 => Some(Self::NoResult),
101 -6 => Some(Self::SwiftSoupError),
102 _ => None,
103 }
104 }
105}
106
107/// Namespace for HTML-related functions.
108#[derive(Debug)]
109pub struct Html;
110
111impl Html {
112 /// Parse HTML into a Document.
113 ///
114 /// As there is no base URL specified, absolute URL resolution requires the
115 /// HTML to have a `<base href>` tag.
116 pub fn parse<T: AsRef<[u8]>>(html: T) -> Result<Document, HtmlError> {
117 let buf = html.as_ref();
118 let rid = unsafe { parse(buf.as_ptr(), buf.len(), "".as_ptr(), 0) };
119 if let Some(error) = HtmlError::from(rid) {
120 Err(error)
121 } else {
122 Ok(Document(unsafe { Element::from(rid) }))
123 }
124 }
125
126 /// Parse HTML into a Document, with a base URL.
127 ///
128 /// The given `base_url` will be used for any URLs that occurs before a
129 /// `<base href>` tag is defined.
130 pub fn parse_with_url<T: AsRef<[u8]>, B: AsRef<str>>(
131 html: T,
132 base_url: B,
133 ) -> Result<Document, HtmlError> {
134 let buf = html.as_ref();
135 let url = base_url.as_ref();
136 let rid = unsafe { parse(buf.as_ptr(), buf.len(), url.as_ptr(), url.len()) };
137 if let Some(error) = HtmlError::from(rid) {
138 Err(error)
139 } else {
140 Ok(Document(unsafe { Element::from(rid) }))
141 }
142 }
143
144 /// Parse a HTML fragment, assuming that it forms the `body` of the HTML.
145 ///
146 /// Similar to [Html::parse], relative URLs will not be resolved unless
147 /// there is a `<base href>` tag.
148 pub fn parse_fragment<T: AsRef<[u8]>>(html: T) -> Result<Document, HtmlError> {
149 let buf = html.as_ref();
150 let rid = unsafe { parse_fragment(buf.as_ptr(), buf.len(), "".as_ptr(), 0) };
151 if let Some(error) = HtmlError::from(rid) {
152 Err(error)
153 } else {
154 Ok(Document(unsafe { Element::from(rid) }))
155 }
156 }
157
158 /// Parse a HTML fragment, assuming that it forms the `body` of the HTML, with a base URL.
159 ///
160 /// Similar to [Html::parse_with_url], URL resolution occurs for any that appears
161 /// before a `<base href>` tag.
162 pub fn parse_fragment_with_url<T: AsRef<[u8]>, B: AsRef<str>>(
163 html: T,
164 base_url: B,
165 ) -> Result<Document, HtmlError> {
166 let buf = html.as_ref();
167 let url = base_url.as_ref();
168 let rid = unsafe { parse_fragment(buf.as_ptr(), buf.len(), url.as_ptr(), url.len()) };
169 if let Some(error) = HtmlError::from(rid) {
170 Err(error)
171 } else {
172 Ok(Document(unsafe { Element::from(rid) }))
173 }
174 }
175
176 /// Escape any HTML-reserved characters to HTML entities.
177 ///
178 /// # Examples
179 /// ```ignore
180 /// use aidoku::imports::html::Html;
181 /// assert_eq!(
182 /// Html::escape("Hello &<> Å å π 新 there ¾ © »"),
183 /// "Hello &<> Å å π 新 there ¾ © »",
184 /// );
185 /// ```
186 pub fn escape<T: AsRef<str>>(text: T) -> String {
187 let text = text.as_ref();
188 let rid = unsafe { escape(text.as_ptr(), text.len()) };
189 read_string_and_destroy(rid).unwrap_or_default()
190 }
191
192 /// Unescape any HTML entities to their original characters.
193 ///
194 /// # Examples
195 /// ```ignore
196 /// use aidoku::imports::html::Html;
197 /// assert_eq!(
198 /// Html::unescape("Hello &<> Å å π 新 there ¾ © »"),
199 /// Some("Hello &<> Å å π 新 there ¾ © »".into()),
200 /// );
201 /// ```
202 pub fn unescape<T: AsRef<str>>(text: T) -> Option<String> {
203 let text = text.as_ref();
204 let rid = unsafe { unescape(text.as_ptr(), text.len()) };
205 if HtmlError::from(rid).is_some() {
206 return None;
207 }
208 read_string_and_destroy(rid)
209 }
210}
211
212#[derive(PartialEq, Eq, Debug)]
213pub enum Kind {
214 Unknown,
215 Node,
216 TextNode,
217 DataNode,
218 Comment,
219 Element,
220 ElementList,
221 Document,
222}
223
224impl From<FFIResult> for Kind {
225 fn from(value: FFIResult) -> Self {
226 match value {
227 0 => Kind::Unknown,
228 1 => Kind::Node,
229 2 => Kind::TextNode,
230 3 => Kind::DataNode,
231 4 => Kind::Comment,
232 5 => Kind::Element,
233 6 => Kind::ElementList,
234 7 => Kind::Document,
235 _ => Kind::Unknown,
236 }
237 }
238}
239
240/// A single HTML node.
241pub struct Node {
242 rid: Rid,
243}
244
245impl Node {
246 /// Get an instance from a [Rid].
247 unsafe fn from(rid: Rid) -> Self {
248 Self { rid }
249 }
250
251 /// Get the kind of the node.
252 pub fn kind(&self) -> Kind {
253 unsafe { kind(self.rid) }.into()
254 }
255
256 /// Get the node's parent node, returning `None` if there isn't one.
257 pub fn parent(&self) -> Option<Node> {
258 let rid = unsafe { parent(self.rid) };
259 if HtmlError::from(rid).is_some() {
260 return None;
261 }
262 Some(unsafe { Node::from(rid) })
263 }
264
265 fn read_node_list(rid: i32) -> Vec<Node> {
266 let mut nodes = Vec::new();
267 if rid < 0 {
268 return nodes;
269 }
270 let len = unsafe { size(rid) };
271 if len <= 0 {
272 return nodes;
273 }
274 for index in 0..len as usize {
275 let node_rid = unsafe { html_get(rid, index) };
276 if node_rid < 0 {
277 continue;
278 }
279 nodes.push(unsafe { Node::from(node_rid) });
280 }
281 nodes
282 }
283
284 /// Get the node's child nodes.
285 pub fn child_nodes(&self) -> Vec<Node> {
286 let rid = unsafe { child_nodes(self.rid) };
287 Self::read_node_list(rid)
288 }
289
290 /// Get the sibling nodes of the node.
291 pub fn siblings(&self) -> Vec<Node> {
292 let rid = unsafe { siblings(self.rid) };
293 Self::read_node_list(rid)
294 }
295
296 /// Get the next sibling of the node, returning `None` if there isn't one.
297 pub fn next(&self) -> Option<Node> {
298 let rid = unsafe { next(self.rid) };
299 if HtmlError::from(rid).is_some() {
300 return None;
301 }
302 Some(unsafe { Node::from(rid) })
303 }
304
305 /// Get the previous sibling of the node, returning `None` if there isn't one.
306 pub fn prev(&self) -> Option<Node> {
307 let rid = unsafe { previous(self.rid) };
308 if HtmlError::from(rid).is_some() {
309 return None;
310 }
311 Some(unsafe { Node::from(rid) })
312 }
313
314 /// Get the node's outer HTML.
315 pub fn outer_html(&self) -> Option<String> {
316 let rid = unsafe { outer_html(self.rid) };
317 if HtmlError::from(rid).is_some() {
318 return None;
319 }
320 read_string_and_destroy(rid)
321 }
322
323 /// Test if this node has an attribute. Case insensitive.
324 pub fn has_attr<T: AsRef<str>>(&self, attr_name: T) -> bool {
325 let attr_name = attr_name.as_ref();
326 unsafe { has_attr(self.rid, attr_name.as_ptr(), attr_name.len()) }
327 }
328
329 /// Set an attribute value on this node.
330 ///
331 /// If this node already has an attribute with the key, its value is updated;
332 /// otherwise, a new attribute is added.
333 pub fn set_attr<K: AsRef<str>, V: AsRef<str>>(
334 &mut self,
335 key: K,
336 value: V,
337 ) -> Result<(), HtmlError> {
338 let key = key.as_ref();
339 let value = value.as_ref();
340 let result = unsafe {
341 set_attr(
342 self.rid,
343 key.as_ptr(),
344 key.len(),
345 value.as_ptr(),
346 value.len(),
347 )
348 };
349
350 if let Some(error) = HtmlError::from(result) {
351 Err(error)
352 } else {
353 Ok(())
354 }
355 }
356
357 /// Remove an attribute from this node.
358 pub fn remove_attr<T: AsRef<str>>(&mut self, attr: T) -> Result<(), HtmlError> {
359 let attr = attr.as_ref();
360 let result = unsafe { remove_attr(self.rid, attr.as_ptr(), attr.len()) };
361
362 if let Some(error) = HtmlError::from(result) {
363 Err(error)
364 } else {
365 Ok(())
366 }
367 }
368
369 /// Get the text of this node, if it is a text node.
370 pub fn text(&self) -> Option<String> {
371 let kind = self.kind();
372 if !matches!(kind, Kind::TextNode | Kind::Element) {
373 return None;
374 }
375 let rid = unsafe { text(self.rid) };
376 if HtmlError::from(rid).is_some() {
377 return None;
378 }
379 read_string_and_destroy(rid)
380 }
381
382 /// Get the data of this node, if it is a data node or comment.
383 pub fn data(&self) -> Option<String> {
384 let kind = self.kind();
385 if !matches!(kind, Kind::DataNode | Kind::Comment | Kind::Element) {
386 return None;
387 }
388 let rid = unsafe { data(self.rid) };
389 if HtmlError::from(rid).is_some() {
390 return None;
391 }
392 read_string_and_destroy(rid)
393 }
394}
395
396impl Display for Node {
397 /// Returns the outer HTML of the node.
398 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
399 write!(f, "{}", self.outer_html().unwrap_or_default())
400 }
401}
402
403impl Drop for Node {
404 fn drop(&mut self) {
405 unsafe { destroy(self.rid) }
406 }
407}
408
409/// A single HTML element.
410pub struct Element(pub(crate) Node);
411
412impl Element {
413 /// Get an instance from a [Rid].
414 unsafe fn from(rid: Rid) -> Self {
415 Self(unsafe { Node::from(rid) })
416 }
417
418 /// Get the kind of the element.
419 pub fn kind(&self) -> Kind {
420 self.0.kind()
421 }
422
423 /// Find elements that match the given CSS (or JQuery) selector.
424 pub fn select<T: AsRef<str>>(&self, css_query: T) -> Option<ElementList> {
425 let query = css_query.as_ref();
426 let rid = unsafe { select(self.0.rid, query.as_ptr(), query.len()) };
427 if HtmlError::from(rid).is_some() {
428 return None;
429 }
430 Some(unsafe { ElementList::from(rid) })
431 }
432
433 /// Find the first element that matches the given CSS (or JQuery) selector.
434 pub fn select_first<T: AsRef<str>>(&self, css_query: T) -> Option<Element> {
435 let query = css_query.as_ref();
436 let rid = unsafe { select_first(self.0.rid, query.as_ptr(), query.len()) };
437 if HtmlError::from(rid).is_some() {
438 return None;
439 }
440 Some(unsafe { Element::from(rid) })
441 }
442
443 /// Get an attribute value by its key.
444 ///
445 /// To get an absolute URL from an attribute that may be a relative URL,
446 /// prefix the key with `abs:`.
447 ///
448 /// # Examples
449 /// ```ignore
450 /// use aidoku::imports::html::Html;
451 /// let html = Html::parse_with_url("<img src=\"/image.jpg\" />", "https://example.com").unwrap();
452 /// let el = html.select_first("img").unwrap();
453 /// assert_eq!(
454 /// el.attr("abs:src"),
455 /// Some("https://example.com/image.jpg".into())
456 /// );
457 /// ```
458 pub fn attr<T: AsRef<str>>(&self, attr_name: T) -> Option<String> {
459 let attr_name = attr_name.as_ref();
460 let rid = unsafe { attr(self.0.rid, attr_name.as_ptr(), attr_name.len()) };
461 if HtmlError::from(rid).is_some() {
462 return None;
463 }
464 read_string_and_destroy(rid)
465 }
466
467 /// Get the normalized, combined text of this element and its children.
468 ///
469 /// Whitespace is normalized and trimmed.
470 ///
471 /// Note that this method returns text that would be presented to a reader.
472 /// The contents of data nodes (e.g. `<script>` tags) are not considered text,
473 /// and instead, [Element::html] or [Element::data] can be used for them.
474 ///
475 /// # Examples
476 /// ```ignore
477 /// use aidoku::imports::html::Html;
478 /// let html = Html::parse("<p>Hello <b>there</b> now! </p>").unwrap();
479 /// let el = html.select_first("p").unwrap();
480 /// assert_eq!(el.text(), Some("Hello there now!".into()));
481 /// ```
482 pub fn text(&self) -> Option<String> {
483 let rid = unsafe { text(self.0.rid) };
484 if HtmlError::from(rid).is_some() {
485 return None;
486 }
487 read_string_and_destroy(rid)
488 }
489
490 /// Get the text of this element and its children.
491 ///
492 /// Whitespace is *not* normalized and trimmed.
493 ///
494 /// Notices from [Element::text] apply.
495 ///
496 /// # Examples
497 /// ```ignore
498 /// use aidoku::imports::html::Html;
499 /// let html = Html::parse("<p>Hello <b>there</b> now! </p>").unwrap();
500 /// let el = html.select_first("p").unwrap();
501 /// assert_eq!(el.untrimmed_text(), Some("Hello there now! ".into()));
502 /// ```
503 pub fn untrimmed_text(&self) -> Option<String> {
504 let rid = unsafe { untrimmed_text(self.0.rid) };
505 if HtmlError::from(rid).is_some() {
506 return None;
507 }
508 read_string_and_destroy(rid)
509 }
510
511 /// Get the element's inner HTML.
512 ///
513 /// # Examples
514 /// ```ignore
515 /// use aidoku::imports::html::Html;
516 /// let html = Html::parse("<div><p></p></div>").unwrap();
517 /// let div = html.select_first("div").unwrap();
518 /// assert_eq!(div.html(), Some("<p></p>".into()));
519 /// ```
520 pub fn html(&self) -> Option<String> {
521 let rid = unsafe { html(self.0.rid) };
522 if HtmlError::from(rid).is_some() {
523 return None;
524 }
525 read_string_and_destroy(rid)
526 }
527
528 /// Get the element's outer HTML.
529 ///
530 /// # Examples
531 /// ```ignore
532 /// use aidoku::imports::html::Html;
533 /// let html = Html::parse("<div><p></p></div>").unwrap();
534 /// let div = html.select_first("div").unwrap();
535 /// assert_eq!(div.outer_html(), Some("<div><p></p></div>".into()));
536 /// ```
537 pub fn outer_html(&self) -> Option<String> {
538 self.0.outer_html()
539 }
540
541 /// Remove this element from the DOM tree.
542 pub fn remove(self) {
543 _ = unsafe { remove(self.0.rid) };
544 }
545
546 /// Get the element's parent element, returning `None` if there isn't one.
547 pub fn parent(&self) -> Option<Element> {
548 let rid = unsafe { parent(self.0.rid) };
549 if HtmlError::from(rid).is_some() {
550 return None;
551 }
552 Some(unsafe { Element::from(rid) })
553 }
554
555 /// Get the elements's child nodes.
556 pub fn child_nodes(&self) -> Vec<Node> {
557 self.0.child_nodes()
558 }
559
560 /// Get the element's children elements.
561 pub fn children(&self) -> ElementList {
562 let rid = unsafe { children(self.0.rid) };
563 unsafe { ElementList::from(rid) }
564 }
565
566 /// Get the sibling elements of the element.
567 pub fn siblings(&self) -> ElementList {
568 let rid = unsafe { siblings(self.0.rid) };
569 unsafe { ElementList::from(rid) }
570 }
571
572 /// Get the next sibling element of the element, returning `None` if there isn't one.
573 pub fn next(&self) -> Option<Element> {
574 let rid = unsafe { next(self.0.rid) };
575 if HtmlError::from(rid).is_some() {
576 return None;
577 }
578 Some(unsafe { Element::from(rid) })
579 }
580
581 /// Get the previous sibling element of the element, returning `None` if there isn't one.
582 pub fn prev(&self) -> Option<Element> {
583 let rid = unsafe { previous(self.0.rid) };
584 if HtmlError::from(rid).is_some() {
585 return None;
586 }
587 Some(unsafe { Element::from(rid) })
588 }
589
590 /// Set the element's text content, clearing any existing content.
591 pub fn set_text<T: AsRef<str>>(&mut self, text: T) -> Result<(), HtmlError> {
592 let text = text.as_ref();
593 let result = unsafe { set_text(self.0.rid, text.as_ptr(), text.len()) };
594
595 if let Some(error) = HtmlError::from(result) {
596 Err(error)
597 } else {
598 Ok(())
599 }
600 }
601
602 /// Set the element's inner HTML, clearing the existing HTML.
603 pub fn set_html<T: AsRef<str>>(&mut self, text: T) -> Result<(), HtmlError> {
604 let text = text.as_ref();
605 let result = unsafe { set_html(self.0.rid, text.as_ptr(), text.len()) };
606
607 if let Some(error) = HtmlError::from(result) {
608 Err(error)
609 } else {
610 Ok(())
611 }
612 }
613
614 /// Prepend inner HTML into this element.
615 ///
616 /// The given HTML will be parsed, and each node prepended to the start
617 /// of the element's children.
618 pub fn prepend<T: AsRef<str>>(&mut self, text: T) -> Result<(), HtmlError> {
619 let text = text.as_ref();
620 let result = unsafe { prepend(self.0.rid, text.as_ptr(), text.len()) };
621
622 if let Some(error) = HtmlError::from(result) {
623 Err(error)
624 } else {
625 Ok(())
626 }
627 }
628
629 /// Append inner HTML into this element.
630 ///
631 /// The given HTML will be parsed, and each node appended to the end
632 /// of the element's children.
633 pub fn append<T: AsRef<str>>(&mut self, text: T) -> Result<(), HtmlError> {
634 let text = text.as_ref();
635 let result = unsafe { append(self.0.rid, text.as_ptr(), text.len()) };
636
637 if let Some(error) = HtmlError::from(result) {
638 Err(error)
639 } else {
640 Ok(())
641 }
642 }
643
644 /// Get the base URI of this Element.
645 pub fn base_uri(&self) -> Option<String> {
646 let rid = unsafe { base_uri(self.0.rid) };
647 if HtmlError::from(rid).is_some() {
648 return None;
649 }
650 read_string_and_destroy(rid)
651 }
652
653 /// Gets the (normalized) text owned by this element.
654 pub fn own_text(&self) -> Option<String> {
655 let rid = unsafe { own_text(self.0.rid) };
656 if HtmlError::from(rid).is_some() {
657 return None;
658 }
659 read_string_and_destroy(rid)
660 }
661
662 /// Get the combined data (e.g. the inside of a `<script>` tag) of this element.
663 ///
664 /// Note that data is NOT the text of the element. Use [Element::text]
665 /// to get the text that would be visible to a user, and [Element::data]
666 /// for the contents of scripts, comments, CSS styles, etc.
667 pub fn data(&self) -> Option<String> {
668 let rid = unsafe { data(self.0.rid) };
669 if HtmlError::from(rid).is_some() {
670 return None;
671 }
672 read_string_and_destroy(rid)
673 }
674
675 /// Get the `id` attribute of this element.
676 pub fn id(&self) -> Option<String> {
677 let rid = unsafe { id(self.0.rid) };
678 if HtmlError::from(rid).is_some() {
679 return None;
680 }
681 read_string_and_destroy(rid)
682 }
683
684 /// Get the name of the tag for this element.
685 ///
686 /// This will always be the lowercased version. For example, `<DIV>` and
687 /// `<div>` would both return `div`.
688 pub fn tag_name(&self) -> Option<String> {
689 let rid = unsafe { tag_name(self.0.rid) };
690 if HtmlError::from(rid).is_some() {
691 return None;
692 }
693 read_string_and_destroy(rid)
694 }
695
696 /// Get the literal value of this node's `class` attribute.
697 ///
698 /// For example, on `<div class="header gray">` this would return `header gray`.
699 pub fn class_name(&self) -> Option<String> {
700 let rid = unsafe { class_name(self.0.rid) };
701 if HtmlError::from(rid).is_some() {
702 return None;
703 }
704 read_string_and_destroy(rid)
705 }
706
707 /// Test if this element has a class. Case insensitive.
708 pub fn has_class<T: AsRef<str>>(&self, class_name: T) -> bool {
709 let class_name = class_name.as_ref();
710 unsafe { has_class(self.0.rid, class_name.as_ptr(), class_name.len()) }
711 }
712
713 /// Add a class name to this element's class attribute.
714 pub fn add_class<T: AsRef<str>>(&mut self, class_name: T) -> Result<(), HtmlError> {
715 let class_name = class_name.as_ref();
716 let result = unsafe { add_class(self.0.rid, class_name.as_ptr(), class_name.len()) };
717
718 if let Some(error) = HtmlError::from(result) {
719 Err(error)
720 } else {
721 Ok(())
722 }
723 }
724
725 /// Remove a class name from this element's class attribute.
726 pub fn remove_class<T: AsRef<str>>(&mut self, class_name: T) -> Result<(), HtmlError> {
727 let class_name = class_name.as_ref();
728 let result = unsafe { remove_class(self.0.rid, class_name.as_ptr(), class_name.len()) };
729
730 if let Some(error) = HtmlError::from(result) {
731 Err(error)
732 } else {
733 Ok(())
734 }
735 }
736
737 /// Test if this element has an attribute. Case insensitive.
738 pub fn has_attr<T: AsRef<str>>(&self, attr_name: T) -> bool {
739 self.0.has_attr(attr_name)
740 }
741
742 /// Set an attribute value on this element.
743 ///
744 /// If this element already has an attribute with the key, its value is updated;
745 /// otherwise, a new attribute is added.
746 pub fn set_attr<K: AsRef<str>, V: AsRef<str>>(
747 &mut self,
748 key: K,
749 value: V,
750 ) -> Result<(), HtmlError> {
751 self.0.set_attr(key, value)
752 }
753
754 /// Remove an attribute from this element.
755 pub fn remove_attr<T: AsRef<str>>(&mut self, attr: T) -> Result<(), HtmlError> {
756 self.0.remove_attr(attr)
757 }
758}
759
760impl Display for Element {
761 fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
762 self.0.fmt(f)
763 }
764}
765
766impl From<Element> for Node {
767 fn from(value: Element) -> Self {
768 value.0
769 }
770}
771
772impl TryFrom<Node> for Element {
773 type Error = HtmlError;
774
775 fn try_from(value: Node) -> Result<Self, HtmlError> {
776 let kind = value.kind();
777 match kind {
778 Kind::Element => Ok(Self(value)),
779 Kind::Document => Ok(Self(value)),
780 _ => Err(HtmlError::InvalidDescriptor),
781 }
782 }
783}
784
785/// A complete HTML document.
786pub struct Document(pub(crate) Element);
787
788impl Document {
789 /// Get an instance from a [Rid].
790 pub(crate) unsafe fn from(rid: Rid) -> Self {
791 Self(unsafe { Element::from(rid) })
792 }
793
794 /// Find elements that match the given CSS (or JQuery) selector.
795 ///
796 /// <details>
797 /// <summary>Supported selectors</summary>
798 ///
799 /// | Pattern | Matches | Example |
800 /// |-------------------------|------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------|
801 /// | `*` | any element | `*` |
802 /// | `tag` | elements with the given tag name | `div` |
803 /// | <code>*\|E</code> | elements of type E in any namespace (including non-namespaced) | <code>*\|name</code> finds `<fb:name>` and `<name>` elements |
804 /// | <code>ns\|E</code> | elements of type E in the namespace ns | <code>fb\|name</code> finds `<fb:name>` elements |
805 /// | `#id` | elements with attribute ID of "id" | `div#wrap`, `#logo` |
806 /// | `.class` | elements with a class name of "class" | `div.left`, `.result` |
807 /// | `[attr]` | elements with an attribute named "attr" (with any value) | `a[href]`, `[title]` |
808 /// | `[^attrPrefix]` | elements with an attribute name starting with "attrPrefix". Use to find elements with HTML5 datasets | `[^data-]`, `div[^data-]` |
809 /// | `[attr=val]` | elements with an attribute named "attr", and value equal to "val" | `img[width=500]`, `a[rel=nofollow]` |
810 /// | `[attr="val"]` | elements with an attribute named "attr", and value equal to "val" | `span[hello="Cleveland"][goodbye="Columbus"]`, `a[rel="nofollow"]`|
811 /// | `[attr^=valPrefix]` | elements with an attribute named "attr", and value starting with "valPrefix" | `a[href^=http:]` |
812 /// | `[attr$=valSuffix]` | elements with an attribute named "attr", and value ending with "valSuffix" | `img[src$=.png]` |
813 /// | `[attr*=valContaining]` | elements with an attribute named "attr", and value containing "valContaining" | `a[href*=/search/]` |
814 /// | `[attr~=regex]` | elements with an attribute named "attr", and value matching the regular expression | `img[src~=(?i)\\.(png\|jpe?g)]` |
815 /// | | The above may be combined in any order | `div.header[title]` |
816 ///
817 /// ## Combinators
818 /// | Pattern | Matches | Example |
819 /// |-----------|-------------------------------------------------|-----------------------------|
820 /// | `E F` | an F element descended from an E element | `div a`, `.logo h1` |
821 /// | `E > F` | an F direct child of E | `ol > li` |
822 /// | `E + F` | an F element immediately preceded by sibling E | `li + li`, `div.head + div` |
823 /// | `E ~ F` | an F element preceded by sibling E | `h1 ~ p` |
824 /// | `E, F, G` | all matching elements E, F, or G | `a[href], div, h3` |
825 ///
826 /// ## Pseudo selectors
827 /// | Pattern | Matches | Example |
828 /// |----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------|
829 /// | `:lt(n)` | elements whose sibling index is less than n | `td:lt(3)` finds the first 3 cells of each row |
830 /// | `:gt(n)` | elements whose sibling index is greater than n | `td:gt(1)` finds cells after skipping the first two |
831 /// | `:eq(n)` | elements whose sibling index is equal to n | `td:eq(0)` finds the first cell of each row |
832 /// | `:has(selector)` | elements that contains at least one element matching the selector | `div:has(p)` finds divs that contain p elements; `div:has(> a)` selects div elements that have at least one direct child a element. |
833 /// | `:not(selector)` | elements that do not match the selector. | `div:not(.logo)` finds all divs that do not have the "logo" class; `div:not(:has(div))` finds divs that do not contain divs. |
834 /// | `:contains(text)` | elements that contains the specified text. The search is case insensitive. The text may appear in the found element, or any of its descendants. | `p:contains(SwiftSoup)` finds p elements containing the text "SwiftSoup"; `p:contains(hello \(there\))` finds p elements containing the text "Hello (There)" |
835 /// | `:matches(regex)` | elements whose text matches the specified regular expression. The text may appear in the found element, or any of its descendants. | `td:matches(\\d+)` finds table cells containing digits. div:matches((?i)login) finds divs containing the text, case insensitively. |
836 /// | `:containsOwn(text)` | elements that directly contain the specified text. The search is case insensitive. The text must appear in the found element, not any of its descendants. | `p:containsOwn(SwiftSoup)` finds p elements with own text "SwiftSoup". |
837 /// | `:matchesOwn(regex)` | elements whose own text matches the specified regular expression. The text must appear in the found element, not any of its descendants. | `td:matchesOwn(\\d+)` finds table cells directly containing digits. div:matchesOwn((?i)login) finds divs containing the text, case insensitively. |
838 ///
839 /// ## Structural pseudo-selectors
840 /// | Pattern | Matches | Example |
841 /// |---------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------|
842 /// | `:root` | The element that is the root of the document. In HTML, this is the html element | | |
843 /// | `:nth-child(an+b)` | elements that have an+b-1 siblings before it in the document tree, for any positive integer or zero value of n, and has a parent element. For values of a and b greater than zero, this effectively divides the element's children into groups of a elements (the last group taking the remainder), and selecting the bth element of each group. For example, this allows the selectors to address every other row in a table, and could be used to alternate the color of paragraph text in a cycle of four. The a and b values must be integers (positive, negative, or zero). The index of the first child of an element is 1. | |
844 /// | `:nth-last-child(an+b)` | elements that have an+b-1 siblings after it in the document tree. Otherwise like `:nth-child()` | `tr:nth-last-child(-n+2)` the last two rows of a table |
845 /// | `:nth-of-type(an+b)` | pseudo-class notation represents an element that has an+b-1 siblings with the same expanded element name before it in the document tree, for any zero or positive integer value of n, and has a parent element | `img:nth-of-type(2n+1)` |
846 /// | `:nth-last-of-type(an+b)` | pseudo-class notation represents an element that has an+b-1 siblings with the same expanded element name after it in the document tree, for any zero or positive integer value of n, and has a parent element | `img:nth-last-of-type(2n+1)` |
847 /// | `:first-child` | elements that are the first child of some other element. | `div > p:first-child` |
848 /// | `:last-child` | elements that are the last child of some other element. | `ol > li:last-child` |
849 /// | `:first-of-type` | elements that are the first sibling of its type in the list of children of its parent element | `dl dt:first-of-type` |
850 /// | `:last-of-type` | elements that are the last sibling of its type in the list of children of its parent element | `tr > td:last-of-type` |
851 /// | `:only-child` | elements that have a parent element and whose parent element has no other element children | |
852 /// | `:only-of-type` | an element that has a parent element and whose parent element has no other element children with the same expanded element name | |
853 /// | `:empty` | elements that have no children at all | |
854 /// </details>
855 pub fn select<T: AsRef<str>>(&self, css_query: T) -> Option<ElementList> {
856 self.0.select(css_query)
857 }
858
859 /// Find the first element that matches the given CSS (or JQuery) selector.
860 pub fn select_first<T: AsRef<str>>(&self, css_query: T) -> Option<Element> {
861 self.0.select_first(css_query)
862 }
863}
864
865impl From<Document> for Element {
866 fn from(value: Document) -> Self {
867 value.0
868 }
869}
870
871impl TryFrom<Node> for Document {
872 type Error = HtmlError;
873
874 fn try_from(value: Node) -> Result<Self, HtmlError> {
875 let kind = value.kind();
876 match kind {
877 Kind::Document => Ok(Self(Element(value))),
878 _ => Err(HtmlError::InvalidDescriptor),
879 }
880 }
881}
882
883impl TryFrom<Element> for Document {
884 type Error = HtmlError;
885
886 fn try_from(value: Element) -> Result<Self, HtmlError> {
887 let kind = value.kind();
888 match kind {
889 Kind::Document => Ok(Self(value)),
890 _ => Err(HtmlError::InvalidDescriptor),
891 }
892 }
893}
894
/// A collection of HTML elements.
///
/// The list is backed by a host-side descriptor and doubles as a
/// double-ended iterator over its elements; the two bound fields track the
/// part of the list that has not been yielded yet.
pub struct ElementList {
    /// Descriptor passed to every host call; destroyed when the list is dropped.
    rid: Rid,
    /// Index of the element the next call to `next` will fetch.
    lower_bound: usize,
    /// Index of the element the next call to `next_back` will fetch.
    ///
    /// Holds `usize::MAX` once there is nothing left to yield from the back
    /// (including a list that was empty from the start).
    upper_bound: usize,
    /// Number of elements reported by the host when the list was created.
    size: usize,
}
902
903impl ElementList {
904 /// Get an instance from a [Rid].
905 unsafe fn from(rid: Rid) -> Self {
906 let size = unsafe { size(rid) as usize };
907 Self {
908 rid,
909 lower_bound: 0,
910 upper_bound: size.wrapping_sub(1),
911 size,
912 }
913 }
914
915 /// Find elements that match the given CSS (or JQuery) selector.
916 pub fn select<T: AsRef<str>>(&self, css_query: T) -> Option<ElementList> {
917 let query = css_query.as_ref();
918 let rid = unsafe { select(self.rid, query.as_ptr(), query.len()) };
919 if HtmlError::from(rid).is_some() {
920 return None;
921 }
922 Some(unsafe { ElementList::from(rid) })
923 }
924
925 /// Find the first element that matches the given CSS (or JQuery) selector.
926 pub fn select_first<T: AsRef<str>>(&self, css_query: T) -> Option<Element> {
927 let query = css_query.as_ref();
928 let rid = unsafe { select_first(self.rid, query.as_ptr(), query.len()) };
929 if HtmlError::from(rid).is_some() {
930 return None;
931 }
932 Some(unsafe { Element::from(rid) })
933 }
934
935 /// Get the normalized, combined text of these elements and their children.
936 ///
937 /// See [Element::text].
938 pub fn text(&self) -> Option<String> {
939 let rid = unsafe { text(self.rid) };
940 if HtmlError::from(rid).is_some() {
941 return None;
942 }
943 read_string_and_destroy(rid)
944 }
945
946 /// Get the text of these elements and their children.
947 ///
948 /// See [Element::untrimmed_text].
949 pub fn untrimmed_text(&self) -> Option<String> {
950 let rid = unsafe { untrimmed_text(self.rid) };
951 if HtmlError::from(rid).is_some() {
952 return None;
953 }
954 read_string_and_destroy(rid)
955 }
956
957 /// Get the combined elements' inner HTML.
958 ///
959 /// See [Element::html].
960 pub fn html(&self) -> Option<String> {
961 let rid = unsafe { html(self.rid) };
962 if HtmlError::from(rid).is_some() {
963 return None;
964 }
965 read_string_and_destroy(rid)
966 }
967
968 /// Get the combined elements' outer HTML.
969 ///
970 /// See [Element::outer_html].
971 pub fn outer_html(&self) -> Option<String> {
972 let rid = unsafe { outer_html(self.rid) };
973 if HtmlError::from(rid).is_some() {
974 return None;
975 }
976 read_string_and_destroy(rid)
977 }
978
979 /// Remove each element from the DOM.
980 pub fn remove(self) {
981 _ = unsafe { remove(self.rid) };
982 }
983
984 /// Get the first element of this element list.
985 pub fn first(&self) -> Option<Element> {
986 let rid = unsafe { first(self.rid) };
987 if HtmlError::from(rid).is_some() {
988 return None;
989 }
990 Some(unsafe { Element::from(rid) })
991 }
992
993 /// Get the last element of this element list.
994 pub fn last(&self) -> Option<Element> {
995 let rid = unsafe { last(self.rid) };
996 if HtmlError::from(rid).is_some() {
997 return None;
998 }
999 Some(unsafe { Element::from(rid) })
1000 }
1001
1002 /// Get the element at the given index.
1003 pub fn get(&self, index: usize) -> Option<Element> {
1004 let rid = unsafe { html_get(self.rid, index) };
1005 if HtmlError::from(rid).is_some() {
1006 return None;
1007 }
1008 Some(unsafe { Element::from(rid) })
1009 }
1010
1011 /// Get the size of this element list.
1012 pub fn size(&self) -> usize {
1013 self.size
1014 }
1015
1016 /// Check if this element list is empty.
1017 pub fn is_empty(&self) -> bool {
1018 self.size() == 0
1019 }
1020}
1021
1022impl Iterator for ElementList {
1023 type Item = Element;
1024
1025 fn next(&mut self) -> Option<Self::Item> {
1026 if self.lower_bound > self.upper_bound || self.upper_bound == usize::MAX {
1027 return None;
1028 }
1029 let value_ref = self.get(self.lower_bound);
1030 self.lower_bound += 1;
1031 value_ref
1032 }
1033}
1034
1035impl DoubleEndedIterator for ElementList {
1036 fn next_back(&mut self) -> Option<Self::Item> {
1037 if self.lower_bound > self.upper_bound || self.upper_bound == usize::MAX {
1038 return None;
1039 }
1040 let value_ref = self.get(self.upper_bound);
1041 self.upper_bound = self.upper_bound.wrapping_sub(1);
1042 value_ref
1043 }
1044}
1045
1046impl Drop for ElementList {
1047 fn drop(&mut self) {
1048 unsafe { destroy(self.rid) }
1049 }
1050}