"""A simple but complete HTML to Abstract Syntax Tree (AST) parser.The AST can also reproduce the HTML text.Example:: >> text = '<div class="note"><p>text</p></div>' >> ast = tokenize_html(text) >> list(ast.walk(include_self=True)) [Root(''), Tag('div', {'class': 'note'}), Tag('p'), Data('text')] >> str(ast) '<div class="note"><p>text</p></div>' >> str(ast[0][0]) '<p>text</p>'Note: optional tags are not accounted for(see https://html.spec.whatwg.org/multipage/syntax.html#optional-tags)"""from__future__importannotationsimportinspectimportitertoolsfromcollectionsimportabc,dequefromcollections.abcimportCallable,Iterable,Iteratorfromhtml.parserimportHTMLParserfromtypingimportAny
class Attribute(dict):
    """Mapping of a tag's attribute names to their values.

    Behaves like a ``dict`` except that looking up a missing key returns
    ``""`` instead of raising ``KeyError``.  A value of ``None`` denotes an
    attribute written without a value (e.g. ``<input disabled>``).
    """

    def __getitem__(self, key: str) -> str:
        """Return the value stored for *key*, or ``""`` if the key is absent."""
        return self.get(key, "")

    @property
    def classes(self) -> list[str]:
        """Return the whitespace-separated names of the 'class' attribute.

        An absent or valueless ``class`` attribute yields an empty list.
        """
        # ``or ""`` guards against a valueless attribute, stored as None.
        return (self["class"] or "").split()

    def __str__(self) -> str:
        """Return the attributes rendered as HTML: ``key="value" ...``.

        Valueless attributes (value ``None``) are rendered as the bare key,
        and double quotes inside a value are escaped as ``&quot;`` so the
        quoted value cannot be terminated early.
        """
        rendered = []
        for key, value in self.items():
            if value is None:
                rendered.append(f"{key}")
                continue
            escaped = str(value).replace('"', "&quot;")
            rendered.append(f'{key}="{escaped}"')
        return " ".join(rendered)
class Element(abc.MutableSequence):
    """An Element of the xml/html document.

    All xml/html entities inherit from this class.  An element is a mutable
    sequence of its child elements; every child records the element that
    contains it and may belong to at most one parent.
    """

    def __init__(self, name: str = "", attr: dict | None = None) -> None:
        """Initialise the element.

        :param name: the name of the element
        :param attr: the element's attributes; copied into an ``Attribute``
        """
        self.name = name
        self.attrs: Attribute = Attribute(attr or {})
        self._parent: Element | None = None
        self._children: list[Element] = []

    @property
    def parent(self) -> Element | None:
        """Return the containing element, or ``None`` if there is none."""
        return self._parent

    @property
    def children(self) -> list[Element]:
        """Return a (shallow) copy of the list of children."""
        return self._children[:]

    def reset_children(self, children: list[Element], deepcopy: bool = False):
        """Replace all children of this element with *children*.

        :param children: the new child elements
        :param deepcopy: if True, store deep copies instead of the given items
        :raises AssertionError: if an item is not an ``Element``, or already
            has a parent other than this element

        All items are validated before any of them is modified, so a failed
        call leaves both this element and the given items untouched.
        """
        new_children: list[Element] = []
        for i, item in enumerate(children):
            # Explicit raise (not ``assert``) so the check survives ``python -O``.
            if not isinstance(item, Element):
                raise AssertionError(f"item {i} is not an Element: {item!r}")
            if deepcopy:
                item = item.deepcopy()
            if item._parent is not None and item._parent is not self:
                raise AssertionError(f"different parent already set for item {i}")
            new_children.append(item)
        for item in new_children:
            item._parent = self
        self._children = new_children

    def __getitem__(self, index):  # type: ignore[override]
        """Return the child (or list of children, for a slice) at *index*."""
        return self._children[index]

    def __setitem__(self, index: int, item: Element):  # type: ignore[override]
        """Replace the child at *index* with *item*, adopting it.

        :raises AssertionError: under the same conditions as ``insert``
        """
        if not isinstance(item, Element):
            raise AssertionError(f"not an Element: {item!r}")
        if item._parent is not None and item._parent is not self:
            raise AssertionError(f"different parent already set for: {item!r}")
        item._parent = self
        self._children[index] = item

    def __delitem__(self, index):  # type: ignore[override]
        """Remove the child (or slice of children) at *index*."""
        del self._children[index]

    def __len__(self) -> int:
        """Return the number of direct children."""
        return len(self._children)

    def insert(self, index: int, item: Element):
        """Insert *item* before position *index* and make this its parent.

        :raises AssertionError: if *item* is not an ``Element``, or already
            has a parent other than this element
        """
        # Explicit raise (not ``assert``) so the check survives ``python -O``.
        if not isinstance(item, Element):
            raise AssertionError(f"not an Element: {item!r}")
        if item._parent is not None and item._parent is not self:
            raise AssertionError(f"different parent already set for: {item!r}")
        item._parent = self
        self._children.insert(index, item)

    def deepcopy(self) -> Element:
        """Recursively copy and remove parent."""
        # The constructor copies the attributes and leaves the parent unset.
        _copy = self.__class__(self.name, self.attrs)
        for child in self:
            _copy.append(child.deepcopy())
        return _copy

    def render(
        self,
        tag_overrides: dict[str, Callable[[Element, dict], str]] | None = None,
        **kwargs,
    ) -> str:
        """Returns a HTML string representation of the element.

        :param tag_overrides: Provide a dictionary of render function
            for specific tag names, to override the normal render format

        :raises NotImplementedError: always; subclasses provide the rendering
        """
        raise NotImplementedError

    def walk(self, include_self: bool = False) -> Iterator[Element]:
        """Walk through the xml/html AST, depth first.

        :param include_self: if True, yield this element before its descendants
        """
        if include_self:
            yield self
        for child in self:
            yield child
            yield from child.walk()

    def strip(self, inplace: bool = False, recurse: bool = False) -> Element:
        """Return copy with all `Data` tokens
        that only contain whitespace / newlines removed.

        :param inplace: if True, modify and return this element instead of
            a deep copy
        :param recurse: if True, also strip all descendants
        """
        element = self if inplace else self.deepcopy()
        kept = [
            child
            for child in element.children
            if not (isinstance(child, Data) and child.data.strip() == "")
        ]
        element.reset_children(kept)
        if recurse:
            for child in element:
                # ``element`` is already ours to modify, so recurse in place.
                child.strip(inplace=True, recurse=True)
        return element

    def find(
        self,
        identifier: str | type[Element],
        attrs: dict | None = None,
        classes: Iterable[str] | None = None,
        include_self: bool = False,
        recurse: bool = True,
    ) -> Iterator[Element]:
        """Find all elements that match name and specific attributes.

        :param identifier: an element name to compare with ``name``,
            or a class to test with ``isinstance``
        :param attrs: attribute values that a match must have exactly
        :param classes: names that must all occur in a match's ``class``
            attribute
        :param include_self: if True, this element is itself a candidate
        :param recurse: if True search all descendants, otherwise only
            the direct children
        """
        candidates: Iterable[Element] = self.walk() if recurse else self
        if include_self:
            candidates = itertools.chain([self], candidates)

        if inspect.isclass(identifier):

            def matches(node: Element) -> bool:
                return isinstance(node, identifier)

        else:

            def matches(node: Element) -> bool:
                return node.name == identifier

        required_classes = set(classes) if classes is not None else None
        required_attrs = attrs or {}

        for node in candidates:
            if not matches(node):
                continue
            if required_classes is not None and not required_classes.issubset(
                node.attrs.classes
            ):
                continue
            if all(node.attrs[key] == value for key, value in required_attrs.items()):
                yield node
class Root(Element):
    """The root of the AST tree."""

    def render(self, **kwargs) -> str:  # type: ignore[override]
        """Return the HTML string for the whole structure.

        Each direct child is rendered with the given keyword arguments and
        the results are concatenated in order.
        """
        pieces = [node.render(**kwargs) for node in self]
        return "".join(pieces)
class Tag(Element):
    """Represent xml/html tags under the form: <name key="value" ...> ... </name>.

    These are the elements that open a new scope: they are the only ones
    pushed onto the stack when nested by ``Tree.nest_tag``.
    """
class Tree:
    """The engine class to generate the AST tree.

    ``outmost`` is the ``Root`` of the tree being built and ``stack`` holds
    the chain of currently open elements, from the root (always at the
    bottom) to the innermost open tag (at the top).
    """

    def __init__(self, name: str = ""):
        """Initialise Tree

        :param name: the name given to the ``Root`` element
        """
        self.name = name
        self.outmost = Root(name)
        self.stack: deque = deque([self.outmost])

    def clear(self):
        """Clear the outmost and stack for a new parsing."""
        self.outmost = Root(self.name)
        self.stack.clear()
        self.stack.append(self.outmost)

    def last(self) -> Element:
        """Return the last pointer which point to the actual tag scope."""
        return self.stack[-1]

    def nest_tag(self, name: str, attrs: dict):
        """Nest a given tag at the bottom of the tree using
        the last stack's pointer.

        The new tag becomes the current scope.
        """
        item = Tag(name, attrs)
        self.last().append(item)
        self.stack.append(item)

    def nest_xtag(self, name: str, attrs: dict):
        """Nest an XTag onto the tree."""
        # Appended to the current scope only; the scope does not change.
        self.last().append(XTag(name, attrs))

    def nest_vtag(self, name: str, attrs: dict):
        """Nest a VoidTag onto the tree."""
        # Appended to the current scope only; the scope does not change.
        self.last().append(VoidTag(name, attrs))

    def nest_terminal(self, klass: type[TerminalElement], data: str):
        """Nest the data onto the tree.

        :param klass: the class instantiated with *data* to hold it
        :param data: the text passed to *klass*
        """
        self.last().append(klass(data))

    def enclose(self, name: str):
        """When a closing tag is found, pop the pointer's scope from the stack,
        to then point to the earlier scope's tag.

        The innermost open tag called *name* is closed together with every
        tag opened inside it.  A closing tag that matches no open tag is
        ignored.
        """
        # stack[0] is always the Root and must stay there: popping it (when a
        # closing tag happens to share the root's name) would leave the stack
        # empty and break every later call to ``last``.
        open_tags = len(self.stack) - 1
        for depth, scope in enumerate(reversed(self.stack), start=1):
            if depth > open_tags:
                return
            if scope.name == name:
                # It pops all the items which do not match with the closing
                # tag, and the matching one itself.
                for _ in range(depth):
                    self.stack.pop()
                return
class HtmlToAst(HTMLParser):
    """The tokenizer class.

    Translates the callbacks of ``html.parser.HTMLParser`` into calls that
    build the AST held by ``self.struct``.
    """

    # see https://html.spec.whatwg.org/multipage/syntax.html#void-elements
    void_elements = {
        "area",
        "base",
        "br",
        "col",
        "embed",
        "hr",
        "img",
        "input",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
    }

    def __init__(self, name: str = "", convert_charrefs: bool = False):
        """Initialise the parser.

        :param name: the name given to the root of the generated tree
        :param convert_charrefs: passed on to ``HTMLParser``
        """
        super().__init__(convert_charrefs=convert_charrefs)
        self.struct = Tree(name)

    def feed(self, source: str) -> Root:  # type: ignore[override]
        """Parse the source string.

        Every call starts a new tree, so *source* must be a complete
        document or fragment.

        :return: the root of the tree built from *source*
        """
        # Also drop the parser's own state: text left in its buffer by a
        # previous call (e.g. an unfinished tag) would otherwise be
        # prepended to this source.
        self.reset()
        self.struct.clear()
        super().feed(source)
        return self.struct.outmost

    def handle_starttag(self, name: str, attr):
        """When found an opening tag then nest it onto the tree."""
        if name in self.void_elements:
            # Void elements have no content, so they do not open a scope.
            self.struct.nest_vtag(name, attr)
        else:
            self.struct.nest_tag(name, attr)

    def handle_startendtag(self, name: str, attr):
        """When found a XHTML tag style then nest it up to the tree."""
        self.struct.nest_xtag(name, attr)

    def handle_endtag(self, name: str):
        """When found a closing tag then makes it point to the right scope."""
        # Void elements never opened a scope, so there is nothing to close.
        if name not in self.void_elements:
            self.struct.enclose(name)

    def handle_data(self, data: str):
        """Nest data onto the tree."""
        self.struct.nest_terminal(Data, data)