[ Index ]

PHP Cross Reference of Moodle 1.9.3 [Build 15-Oct-2008]

title

Body

[close]

/lib/ -> html2text.php (source)

   1  <?php
   2  
   3  /***************************************************************
   4   * Library to convert HTML into an approximate text equivalent *
   5   ***************************************************************
   6  
   7    Version: 1.0.3  (with modifications)
   8    Copyright 2003 Mark Wilton-Jones
   9    License: HowToCreate script license with written permission
  10    URL: http://www.howtocreate.co.uk/php/
  11  
  12    For full details about the script and to get the latest version,
  13    please see the HowToCreate web site above.
  14  
  15    This version contains modifications for Moodle.  In each case the
  16    lines are marked with "Moodle", so you can  see what has changed.
  17  
  18    ********************************************************************/
  19  
  /**
   * Convert an HTML document or fragment into an approximate plain-text equivalent.
   *
   * Processing order:
   *   1. HTML comments are removed.
   *   2. The markup is scanned one byte at a time so that stray '<', '>' and '"'
   *      characters are escaped and <style>/<script> elements are dropped, which
   *      makes the string safe to hand to strip_tags().
   *   3. Every tag that has no text equivalent is stripped.
   *   4. Pairs of spaces are reduced outside <pre> and <textarea>.
   *   5. The remaining tags are replaced by text markers (rules, line breaks,
   *      list markers, "text [url]" links, [INPUT] and so on).
   *   6. Named and decimal numeric entities are decoded in a single pass.
   *   7. The text is wrapped at 78 columns and surplus blank lines/trailing
   *      whitespace are removed.
   *
   * @param string $badStr HTML to convert. Other scalar values and null are cast to string.
   * @return string The plain-text rendering.
   */
  function html2text( $badStr ) {

      // Cast once so that null or scalar input cannot trigger notices further down.
      $badStr = (string) $badStr;

      $is_open_tb = false; // true while inside a tag, i.e. between '<' and its closing '>'
      $is_open_dq = false; // true while inside a double-quoted attribute value
      $is_open_sq = false; // true while inside a single-quoted attribute value

      //remove comments

      // Stop as soon as there is no opener, or no terminator after the first opener.
      while ( ( $commentStart = strpos( $badStr, '<!--' ) ) !== false &&
              ( $commentEnd = strpos( $badStr, '-->', $commentStart ) ) !== false ) {
          $badStr = substr( $badStr, 0, $commentStart ) . substr( $badStr, $commentEnd + 3 );
      }

      //now make sure all HTML tags are correctly written (> not in between quotes)

      $len = strlen( $badStr ); // Moodle
      $lowerStr = strtolower( $badStr ); // lower-cased once; used for case-insensitive tag lookups
      $goodStr = ''; // Moodle

      // The loop does not run for an empty string, so no offset is read from one.
      for ( $x = 0; $x < $len; $x++ ) { // Moodle
          $chr = $badStr[$x]; //take each letter in turn and check if that character is permitted there
          switch ( $chr ) {
              case '<':
                  if ( !$is_open_tb && substr( $lowerStr, $x + 1, 5 ) == 'style' ) {
                      // Jump to the '>' of </style>; the loop increment then steps past it.
                      // With no closing tag the rest of the input is dropped, because
                      // falling back to a fixed offset could move $x backwards and loop forever.
                      $closeAt = strpos( $lowerStr, '</style>', $x );
                      $x = ( $closeAt === false ) ? $len : $closeAt + 7; // Moodle
                      $chr = '';
                  } else if ( !$is_open_tb && substr( $lowerStr, $x + 1, 6 ) == 'script' ) {
                      // Same handling as <style>, for </script>.
                      $closeAt = strpos( $lowerStr, '</script>', $x );
                      $x = ( $closeAt === false ) ? $len : $closeAt + 8; // Moodle
                      $chr = '';
                  } else if ( !$is_open_tb ) {
                      $is_open_tb = true;
                  } else {
                      // A '<' inside an already open tag cannot start another tag.
                      $chr = '&lt;';
                  }
                  break;

              case '>':
                  if ( !$is_open_tb || $is_open_dq || $is_open_sq ) {
                      // Not a tag terminator: outside any tag, or inside a quoted value.
                      $chr = '&gt;';
                  } else {
                      $is_open_tb = false;
                  }
                  break;

              case '"':
                  if ( $is_open_tb && !$is_open_dq && !$is_open_sq ) {
                      $is_open_dq = true;
                  } else if ( $is_open_tb && $is_open_dq && !$is_open_sq ) {
                      $is_open_dq = false;
                  } else {
                      // Outside a tag, or nested inside a single-quoted value.
                      $chr = '&quot;';
                  }
                  break;

              case "'":
                  // Single quotes are only tracked, never escaped.
                  if ( $is_open_tb && !$is_open_dq && !$is_open_sq ) {
                      $is_open_sq = true;
                  } else if ( $is_open_tb && !$is_open_dq && $is_open_sq ) {
                      $is_open_sq = false;
                  }
                  break;
          }
          $goodStr .= $chr;
      }

      //now that the page is valid (I hope) for strip_tags, strip all unwanted tags

      $goodStr = strip_tags( $goodStr, '<title><hr><h1><h2><h3><h4><h5><h6><div><p><pre><sup><ul><ol><br><dl><dt><table><caption><tr><li><dd><th><td><a><area><img><form><input><textarea><button><select><option>' );

      //strip extra whitespace except between <pre> and <textarea> tags

      // Splitting on opening and closing tags alike leaves the element contents
      // at the odd indexes; those are re-wrapped untouched.
      $preParts = preg_split( "/<\/?pre[^>]*>/i", $goodStr );

      foreach ( $preParts as $preIndex => $prePart ) {
          if ( $preIndex % 2 ) {
              $preParts[$preIndex] = '<pre>'.$prePart.'</pre>';
              continue;
          }
          $areaParts = preg_split( "/<\/?textarea[^>]*>/i", $prePart );
          foreach ( $areaParts as $areaIndex => $areaPart ) {
              if ( $areaIndex % 2 ) {
                  $areaParts[$areaIndex] = '<textarea>'.$areaPart.'</textarea>';
              } else {
                  // Single pass: each pair of spaces becomes one space.
                  $areaParts[$areaIndex] = str_replace( '  ', ' ', $areaPart );
              }
          }
          $preParts[$preIndex] = implode( '', $areaParts );
      }

      $goodStr = implode( '', $preParts );

      //remove all options from select inputs

      $goodStr = preg_replace( "/<option[^>]*>[^<]*/i", '', $goodStr );

      //replace all tags with their text equivalents

      $goodStr = preg_replace( "/<(\/title|hr)[^>]*>/i", "\n          --------------------\n", $goodStr );

      $goodStr = preg_replace( "/<(h|div|p)[^>]*>/i", "\n\n", $goodStr );

      $goodStr = preg_replace( "/<sup[^>]*>/i", '^', $goodStr );

      $goodStr = preg_replace( "/<(ul|ol|br|dl|dt|table|caption|\/textarea|tr[^>]*>\s*<(td|th))[^>]*>/i", "\n", $goodStr );

      // The list marker is written as an explicit UTF-8 byte sequence (U+00B7,
      // middle dot) so it does not depend on the encoding this file is saved in.
      // NOTE(review): the original literal was a single non-ASCII character that
      // no longer survives in this copy of the file; presumably a dot/bullet - confirm.
      $goodStr = preg_replace( "/<li[^>]*>/i", "\n\xC2\xB7 ", $goodStr );

      $goodStr = preg_replace( "/<dd[^>]*>/i", "\n\t", $goodStr );

      $goodStr = preg_replace( "/<(th|td)[^>]*>/i", "\t", $goodStr );

   // $goodStr = preg_replace( "/<a[^>]* href=(\"((?!\"|#|javascript:)[^\"#]*)(\"|#)|'((?!'|#|javascript:)[^'#]*)('|#)|((?!'|\"|>|#|javascript:)[^#\"'> ]*))[^>]*>/i", "[LINK: $2$4$6] ", $goodStr );   // Moodle
      // Links become "link text [url]"; $2, $4 and $6 are the double-quoted,
      // single-quoted and unquoted href alternatives, $7 is the link text.
      $goodStr = preg_replace( "/<a\s[^>]*href=(\"((?!\"|#|javascript:)[^\"#]*)(\"|#)|'((?!'|#|javascript:)[^'#]*)('|#)|((?!'|\"|>|#|javascript:)[^#\"'> ]*))[^>]*>([^<]*)<\/a>/i", "$7 [$2$4$6]", $goodStr );

      // $goodStr = preg_replace( "/<img[^>]* alt=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", "[IMAGE: $2$3$4] ", $goodStr );   // Moodle
      $goodStr = preg_replace( "/<img[^>]* alt=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", "[$2$3$4] ", $goodStr );

      $goodStr = preg_replace( "/<form[^>]* action=(\"([^\"]+)\"|'([^']+)'|([^\"'> ]+))[^>]*>/i", "\n[FORM: $2$3$4] ", $goodStr );

      $goodStr = preg_replace( "/<(input|textarea|button|select)[^>]*>/i", "[INPUT] ", $goodStr );

      //strip all remaining tags (mostly closing tags)

      $goodStr = strip_tags( $goodStr );

      //convert HTML entities

      // Named and decimal numeric entities share one strtr() map, so the text is
      // decoded exactly once: "&amp;#65;" yields the literal "&#65;", not "A".
      // Numeric entities are decoded to UTF-8; html_entity_decode() returns an
      // entity unchanged when it cannot decode it, so such entities stay as written.
      // NOTE(review): get_html_translation_table() is left on its default encoding,
      // which is UTF-8 from PHP 5.4 on - confirm for older runtimes.
      $entityMap = array_flip( get_html_translation_table( HTML_ENTITIES ) );

      if ( preg_match_all( '/&#\d+;/', $goodStr, $numericEntities ) ) {
          foreach ( array_unique( $numericEntities[0] ) as $entity ) {
              $entityMap[$entity] = html_entity_decode( $entity, ENT_QUOTES, 'UTF-8' );
          }
      }

      $goodStr = strtr( $goodStr, $entityMap );

      //wordwrap

      // $goodStr = wordwrap( $goodStr );   // Moodle
      $goodStr = wordwrap( $goodStr, 78 );

      //make sure there are no more than 3 linebreaks in a row and trim whitespace
      $goodStr = preg_replace("/\r\n?|\f/", "\n", $goodStr);
      $goodStr = preg_replace("/\n(\s*\n){2}/", "\n\n\n", $goodStr);
      $goodStr = preg_replace("/[ \t]+(\n|$)/", "$1", $goodStr);
      $goodStr = preg_replace("/^\n*|\n*$/", '', $goodStr);

      return $goodStr;

  }
 166  
 167  ?>


Generated: Wed Jan 14 11:33:29 2009 Cross-referenced by PHPXref 0.7