lxml解析html的不同实例化类的比较

lxml.etree._Element 对象

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
from lxml import etree

# Sample HTML fragment used throughout the comparison.
string = '<div class="post" id="123"><p class="para">abc<a href="/to-go">link</a></p></div>'

# Parse the markup with etree.HTML(); the recorded output below shows that the
# resulting object is an lxml.etree._Element.
doc = etree.HTML(string)

# Gather the public (non-underscore) attribute names exposed by the object.
public_names = []
for name in dir(doc):
    if name.startswith('_'):
        continue
    public_names.append(name)

print(public_names)
print(type(doc))

# Output recorded in the article:
# ['addnext', 'addprevious', 'append', 'attrib', 'base', 'clear', 'cssselect',
#  'extend', 'find', 'findall', 'findtext', 'get', 'getchildren', 'getiterator',
#  'getnext', 'getparent', 'getprevious', 'getroottree', 'index', 'insert',
#  'items', 'iter', 'iterancestors', 'iterchildren', 'iterdescendants',
#  'iterfind', 'itersiblings', 'itertext', 'keys', 'makeelement', 'nsmap',
#  'prefix', 'remove', 'replace', 'set', 'sourceline', 'tag', 'tail', 'text',
#  'values', 'xpath']
# <class 'lxml.etree._Element'>

lxml.etree._Element 的文档地址在这里 lxml.etree._Element 文档

lxml.html.HtmlElement 对象

1
2
3
4
5
6
7
8
9
from lxml import html

# Sample HTML fragment used throughout the comparison.
string = '<div class="post" id="123"><p class="para">abc<a href="/to-go">link</a></p></div>'

# Parse the markup with lxml.html.fromstring(); the recorded output below shows
# that the resulting object is an lxml.html.HtmlElement.
doc = html.fromstring(string)

# Keep only the public (non-underscore) attribute names of the parsed object.
visible_attrs = list(filter(lambda attr: not attr.startswith('_'), dir(doc)))
print(visible_attrs)
print(type(doc))

# Output recorded in the article:
# ['addnext', 'addprevious', 'append', 'attrib', 'base', 'base_url', 'body',
#  'classes', 'clear', 'cssselect', 'drop_tag', 'drop_tree', 'extend', 'find',
#  'find_class', 'find_rel_links', 'findall', 'findtext', 'forms', 'get',
#  'get_element_by_id', 'getchildren', 'getiterator', 'getnext', 'getparent',
#  'getprevious', 'getroottree', 'head', 'index', 'insert', 'items', 'iter',
#  'iterancestors', 'iterchildren', 'iterdescendants', 'iterfind', 'iterlinks',
#  'itersiblings', 'itertext', 'keys', 'label', 'make_links_absolute',
#  'makeelement', 'nsmap', 'prefix', 'remove', 'replace', 'resolve_base_href',
#  'rewrite_links', 'set', 'sourceline', 'tag', 'tail', 'text', 'text_content',
#  'values', 'xpath']
# <class 'lxml.html.HtmlElement'>

lxml.etree._ElementTree

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
from io import BytesIO
from lxml import etree

# Sample HTML fragment. The original literal ended with a stray apostrophe
# (`</div>'`) that was parsed as part of the markup; it is removed here so the
# input matches the other examples.
string = """<div class="post" id="123"><p class="para">abc<a href="/to-go">link</a></p></div>"""

# The parser is handed a binary file-like object, so encode the text first.
bytes_string = string.encode('utf-8')

# Parse the stream with an explicit HTML parser; the recorded output below shows
# that the resulting object is an lxml.etree._ElementTree.
html2 = etree.parse(BytesIO(bytes_string), etree.HTMLParser())

# Show the public (non-underscore) attribute names and the concrete type.
print([i for i in dir(html2) if not i.startswith('_')])
print(type(html2))

# Output recorded in the article:
# ['docinfo', 'find', 'findall', 'findtext', 'getelementpath', 'getiterator',
#  'getpath', 'getroot', 'iter', 'iterfind', 'parse', 'parser', 'relaxng',
#  'write', 'write_c14n', 'xinclude', 'xmlschema', 'xpath', 'xslt']
# <class 'lxml.etree._ElementTree'>

遍历元素并获取每个元素的绝对 XPath 定位

1
2
3
4
5
6
7
8
9
10
from lxml import html
# Import the submodule explicitly: a bare `import lxml` does not by itself
# guarantee that `lxml.etree` is loaded as an attribute of the package.
import lxml.etree

# Sample HTML fragment used throughout the comparison.
string = '<div class="post" id="123"><p class="para">abc<a href="/to-go">link</a></p></div>'
doc = html.fromstring(string)

# Select every <a> element in the parsed document.
alist = doc.xpath("//a")
for a in alist:
    # The loop body must be indented; the original snippet had lost its
    # indentation and raised IndentationError.
    # NOTE(review): the tree is built around `a` itself rather than the document
    # root -- confirm that getpath() prints the document-absolute path intended
    # by the article.
    tree = lxml.etree.ElementTree(a)
    print(tree.getpath(a))

比较结论

lxml.html.HtmlElement 是 lxml.etree._Element 的子类,同时又继承了 HtmlMixin 类,是对 lxml.etree._Element 的补充

lxml.etree._Element 与 lxml.html.HtmlElement 的关系见官方文档:地址

其中的继承图如下

image-20221125161605643

简书上有一篇讲解 lxml.html.HtmlElement 用法的文章可以参考:传送门


lxml解析html的不同实例化类的比较
https://kingjem.github.io/2024/10/14/lxml解析html的不同实例化类的比较/
作者
Ruhai
发布于
2024年10月14日
许可协议