182 lines
		
	
	
		
			7.4 KiB
		
	
	
	
		
			PHP
		
	
	
		
		
			
		
	
	
			182 lines
		
	
	
		
			7.4 KiB
		
	
	
	
		
			PHP
		
	
	
|  | <?php | ||
|  | 
 | ||
|  | /** | ||
|  |  * Takes a well formed list of tokens and fixes their nesting. | ||
|  |  * | ||
|  |  * HTML elements dictate which elements are allowed to be their children, | ||
|  |  * for example, you can't have a p tag in a span tag.  Other elements have | ||
|  |  * much more rigorous definitions: tables, for instance, require a specific | ||
|  |  * order for their elements.  There are also constraints not expressible by | ||
|  |  * document type definitions, such as the chameleon nature of ins/del | ||
|  |  * tags and global child exclusions. | ||
|  |  * | ||
|  |  * The first major objective of this strategy is to iterate through all | ||
|  |  * the nodes and determine whether or not their children conform to the | ||
|  |  * element's definition.  If they do not, the child definition may | ||
|  |  * optionally supply an amended list of elements that is valid or | ||
|  |  * require that the entire node be deleted (and the previous node | ||
|  |  * rescanned). | ||
|  |  * | ||
|  |  * The second objective is to ensure that explicitly excluded elements of | ||
|  |  * an element do not appear in its children.  Code that accomplishes this | ||
|  |  * task is pervasive through the strategy, though the two are distinct tasks | ||
|  |  * and could, theoretically, be separated (although it's not recommended). | ||
|  |  * | ||
|  |  * @note Whether or not unrecognized children are silently dropped or | ||
|  |  *       translated into text depends on the child definitions. | ||
|  |  * | ||
|  |  * @todo Enable nodes to be bubbled out of the structure.  This is | ||
|  |  *       easier with our new algorithm. | ||
|  |  */ | ||
|  | 
 | ||
class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
{

    /**
     * Validates the children of every element against that element's child
     * definition and drops nodes that are excluded or rejected.
     *
     * The token list is first converted into a node tree, the tree is walked
     * in post-order (children before their parent) using an explicit stack,
     * and the surviving tree is flattened back into a token list.
     *
     * While the walk is running the context variables 'IsInline',
     * 'CurrentNode' and 'CurrentToken' are registered; they are destroyed
     * again before returning.
     *
     * @param HTMLPurifier_Token[] $tokens
     * @param HTMLPurifier_Config $config
     * @param HTMLPurifier_Context $context
     * @return array|HTMLPurifier_Token[]
     */
    public function execute($tokens, $config, $context)
    {

        //####################################################################//
        // Pre-processing

        // O(n) pass to convert to a tree, so that we can efficiently
        // refer to substrings
        $top_node = HTMLPurifier_Arborize::arborize($tokens, $config, $context);

        // get a copy of the HTML definition; the parent definition is used
        // for the root node, which has no entry of its own in ->info
        $definition = $config->getHTMLDefinition();
        $parent_def = $definition->info_parent_def;

        $excludes_enabled = !$config->get('Core.DisableExcludes');

        // setup the context variable 'IsInline', for chameleon processing
        // is 'false' when we are not inline, 'true' when it must always
        // be inline, and an integer when it is inline for a certain
        // branch of the document tree
        // NOTE(review): $is_inline, $node and $token are reassigned inside
        // the loop below; presumably register() binds them by reference so
        // the context follows those reassignments. Verify against
        // HTMLPurifier_Context before renaming any of these variables.
        $is_inline = $parent_def->descendants_are_inline;
        $context->register('IsInline', $is_inline);

        // setup error collector; a falsy $e is treated below as
        // "no collector available" and error reporting is skipped
        $e =& $context->get('ErrorCollector', true);

        //####################################################################//
        // Loop initialization

        // variable that contains the start token while we are processing
        // nodes. This enables error reporting to do its job
        $node = $top_node;
        // dummy token
        list($token) = $node->toTokenPair();
        $context->register('CurrentNode', $node);
        $context->register('CurrentToken', $token);

        //####################################################################//
        // Loop

        // We need to implement a post-order traversal iteratively, to
        // avoid running into stack space limits.  This is pretty tricky
        // to reason about, so we just manually stack-ify the recursive
        // variant:
        //
        //  function f($node) {
        //      foreach ($node->children as $child) {
        //          f($child);
        //      }
        //      validate($node);
        //  }
        //
        // Thus, we will represent a stack frame as
        //
        //  array($node, $is_inline, $excludes, $ix)
        //
        // where $excludes is the set of element names excluded at this
        // point of the tree (keyed by name) and $ix is the index of the
        // next child of $node that has not been descended into yet.

        $stack = array(
            array(
                $top_node,
                $parent_def->descendants_are_inline,
                $parent_def->excludes, // exclusions
                0
            )
        );

        while (!empty($stack)) {
            list($node, $is_inline, $excludes, $ix) = array_pop($stack);
            // recursive call
            $go = false;
            // the stack is empty right after popping only for the root frame
            $def = empty($stack) ? $parent_def : $definition->info[$node->name];
            while (isset($node->children[$ix])) {
                $child = $node->children[$ix++];
                if ($child instanceof HTMLPurifier_Node_Element) {
                    $go = true;
                    // re-push the current frame (with the advanced index) so
                    // that we resume here once the child has been processed
                    $stack[] = array($node, $is_inline, $excludes, $ix);
                    $stack[] = array(
                        $child,
                        // ToDo: I don't think it matters if it's def or
                        // child_def, but double check this...
                        $is_inline || $def->descendants_are_inline,
                        empty($def->excludes) ? $excludes
                                              : array_merge($excludes, $def->excludes),
                        0
                    );
                    break;
                }
            }
            if ($go) {
                continue;
            }
            list($token) = $node->toTokenPair();
            // base case: every element child has been handled, so the node
            // itself can be validated now
            if ($excludes_enabled && isset($excludes[$node->name])) {
                $node->dead = true;
                if ($e) {
                    $e->send(E_ERROR, 'Strategy_FixNesting: Node excluded');
                }
            } else {
                // XXX I suppose it would be slightly more efficient to
                // avoid the allocation here and have children
                // strategies handle it
                $children = array();
                foreach ($node->children as $child) {
                    if (!$child->dead) {
                        $children[] = $child;
                    }
                }
                $result = $def->child->validateChildren($children, $config, $context);
                if ($result === true) {
                    // children are valid as they are; keep them, minus the
                    // ones that were marked dead
                    $node->children = $children;
                } elseif ($result === false) {
                    // children cannot be fixed; the whole node goes
                    $node->dead = true;
                    if ($e) {
                        $e->send(E_ERROR, 'Strategy_FixNesting: Node removed');
                    }
                } else {
                    // the child definition supplied an amended child list
                    $node->children = $result;
                    if ($e) {
                        // XXX This will miss mutations of internal nodes. Perhaps defer to the child validators
                        if (empty($result) && !empty($children)) {
                            $e->send(E_ERROR, 'Strategy_FixNesting: Node contents removed');
                        } elseif ($result != $children) {
                            $e->send(E_WARNING, 'Strategy_FixNesting: Node reorganized');
                        }
                    }
                }
            }
        }

        //####################################################################//
        // Post-processing

        // remove context variables
        $context->destroy('IsInline');
        $context->destroy('CurrentNode');
        $context->destroy('CurrentToken');

        //####################################################################//
        // Return

        // the last frame popped is the bottom of the stack, i.e. the root
        // frame, so $node refers to the root of the tree again here
        return HTMLPurifier_Arborize::flatten($node, $config, $context);
    }
}
|  | 
 | ||
|  | // vim: et sw=4 sts=4
 |