HTML.php 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468
  1. <?php
  2. /**
  3. * PHPExcel
  4. *
  5. * Copyright (c) 2006 - 2014 PHPExcel
  6. *
  7. * This library is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * This library is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with this library; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. *
  21. * @category PHPExcel
  22. * @package PHPExcel_Reader
  23. * @copyright Copyright (c) 2006 - 2014 PHPExcel (http://www.codeplex.com/PHPExcel)
  24. * @license http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt LGPL
  25. * @version 1.8.0, 2014-03-02
  26. */
  27. /** PHPExcel root directory */
  28. if (!defined('PHPEXCEL_ROOT')) {
  29. /**
  30. * @ignore
  31. */
  32. define('PHPEXCEL_ROOT', dirname(__FILE__) . '/../../');
  33. require(PHPEXCEL_ROOT . 'PHPExcel/Autoloader.php');
  34. }
  35. /**
  36. * PHPExcel_Reader_HTML
  37. *
  38. * @category PHPExcel
  39. * @package PHPExcel_Reader
  40. * @copyright Copyright (c) 2006 - 2014 PHPExcel (http://www.codeplex.com/PHPExcel)
  41. */
  42. class PHPExcel_Reader_HTML extends PHPExcel_Reader_Abstract implements PHPExcel_Reader_IReader
  43. {
  44. /**
  45. * Input encoding
  46. *
  47. * @var string
  48. */
  49. private $_inputEncoding = 'ANSI';
  50. /**
  51. * Sheet index to read
  52. *
  53. * @var int
  54. */
  55. private $_sheetIndex = 0;
  56. /**
  57. * Formats
  58. *
  59. * @var array
  60. */
  61. private $_formats = array( 'h1' => array( 'font' => array( 'bold' => true,
  62. 'size' => 24,
  63. ),
  64. ), // Bold, 24pt
  65. 'h2' => array( 'font' => array( 'bold' => true,
  66. 'size' => 18,
  67. ),
  68. ), // Bold, 18pt
  69. 'h3' => array( 'font' => array( 'bold' => true,
  70. 'size' => 13.5,
  71. ),
  72. ), // Bold, 13.5pt
  73. 'h4' => array( 'font' => array( 'bold' => true,
  74. 'size' => 12,
  75. ),
  76. ), // Bold, 12pt
  77. 'h5' => array( 'font' => array( 'bold' => true,
  78. 'size' => 10,
  79. ),
  80. ), // Bold, 10pt
  81. 'h6' => array( 'font' => array( 'bold' => true,
  82. 'size' => 7.5,
  83. ),
  84. ), // Bold, 7.5pt
  85. 'a' => array( 'font' => array( 'underline' => true,
  86. 'color' => array( 'argb' => PHPExcel_Style_Color::COLOR_BLUE,
  87. ),
  88. ),
  89. ), // Blue underlined
  90. 'hr' => array( 'borders' => array( 'bottom' => array( 'style' => PHPExcel_Style_Border::BORDER_THIN,
  91. 'color' => array( PHPExcel_Style_Color::COLOR_BLACK,
  92. ),
  93. ),
  94. ),
  95. ), // Bottom border
  96. );
  97. /**
  98. * Create a new PHPExcel_Reader_HTML
  99. */
  100. public function __construct() {
  101. $this->_readFilter = new PHPExcel_Reader_DefaultReadFilter();
  102. }
  103. /**
  104. * Validate that the current file is an HTML file
  105. *
  106. * @return boolean
  107. */
  108. protected function _isValidFormat()
  109. {
  110. // Reading 2048 bytes should be enough to validate that the format is HTML
  111. $data = fread($this->_fileHandle, 2048);
  112. if ((strpos($data, '<') !== FALSE) &&
  113. (strlen($data) !== strlen(strip_tags($data)))) {
  114. return TRUE;
  115. }
  116. return FALSE;
  117. }
  118. /**
  119. * Loads PHPExcel from file
  120. *
  121. * @param string $pFilename
  122. * @return PHPExcel
  123. * @throws PHPExcel_Reader_Exception
  124. */
  125. public function load($pFilename)
  126. {
  127. // Create new PHPExcel
  128. $objPHPExcel = new PHPExcel();
  129. // Load into this instance
  130. return $this->loadIntoExisting($pFilename, $objPHPExcel);
  131. }
  132. /**
  133. * Set input encoding
  134. *
  135. * @param string $pValue Input encoding
  136. */
  137. public function setInputEncoding($pValue = 'ANSI')
  138. {
  139. $this->_inputEncoding = $pValue;
  140. return $this;
  141. }
  142. /**
  143. * Get input encoding
  144. *
  145. * @return string
  146. */
  147. public function getInputEncoding()
  148. {
  149. return $this->_inputEncoding;
  150. }
  151. // Data Array used for testing only, should write to PHPExcel object on completion of tests
  152. private $_dataArray = array();
  153. private $_tableLevel = 0;
  154. private $_nestedColumn = array('A');
  155. private function _setTableStartColumn($column) {
  156. if ($this->_tableLevel == 0)
  157. $column = 'A';
  158. ++$this->_tableLevel;
  159. $this->_nestedColumn[$this->_tableLevel] = $column;
  160. return $this->_nestedColumn[$this->_tableLevel];
  161. }
  162. private function _getTableStartColumn() {
  163. return $this->_nestedColumn[$this->_tableLevel];
  164. }
  165. private function _releaseTableStartColumn() {
  166. --$this->_tableLevel;
  167. return array_pop($this->_nestedColumn);
  168. }
  169. private function _flushCell($sheet,$column,$row,&$cellContent) {
  170. if (is_string($cellContent)) {
  171. // Simple String content
  172. if (trim($cellContent) > '') {
  173. // Only actually write it if there's content in the string
  174. // echo 'FLUSH CELL: ' , $column , $row , ' => ' , $cellContent , '<br />';
  175. // Write to worksheet to be done here...
  176. // ... we return the cell so we can mess about with styles more easily
  177. $cell = $sheet->setCellValue($column.$row,$cellContent,true);
  178. $this->_dataArray[$row][$column] = $cellContent;
  179. }
  180. } else {
  181. // We have a Rich Text run
  182. // TODO
  183. $this->_dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
  184. }
  185. $cellContent = (string) '';
  186. }
  187. private function _processDomElement(DOMNode $element, $sheet, &$row, &$column, &$cellContent){
  188. foreach($element->childNodes as $child){
  189. if ($child instanceof DOMText) {
  190. $domText = preg_replace('/\s+/',' ',trim($child->nodeValue));
  191. if (is_string($cellContent)) {
  192. // simply append the text if the cell content is a plain text string
  193. $cellContent .= $domText;
  194. } else {
  195. // but if we have a rich text run instead, we need to append it correctly
  196. // TODO
  197. }
  198. } elseif($child instanceof DOMElement) {
  199. // echo '<b>DOM ELEMENT: </b>' , strtoupper($child->nodeName) , '<br />';
  200. $attributeArray = array();
  201. foreach($child->attributes as $attribute) {
  202. // echo '<b>ATTRIBUTE: </b>' , $attribute->name , ' => ' , $attribute->value , '<br />';
  203. $attributeArray[$attribute->name] = $attribute->value;
  204. }
  205. switch($child->nodeName) {
  206. case 'meta' :
  207. foreach($attributeArray as $attributeName => $attributeValue) {
  208. switch($attributeName) {
  209. case 'content':
  210. // TODO
  211. // Extract character set, so we can convert to UTF-8 if required
  212. break;
  213. }
  214. }
  215. $this->_processDomElement($child,$sheet,$row,$column,$cellContent);
  216. break;
  217. case 'title' :
  218. $this->_processDomElement($child,$sheet,$row,$column,$cellContent);
  219. $sheet->setTitle($cellContent);
  220. $cellContent = '';
  221. break;
  222. case 'span' :
  223. case 'div' :
  224. case 'font' :
  225. case 'i' :
  226. case 'em' :
  227. case 'strong':
  228. case 'b' :
  229. // echo 'STYLING, SPAN OR DIV<br />';
  230. if ($cellContent > '')
  231. $cellContent .= ' ';
  232. $this->_processDomElement($child,$sheet,$row,$column,$cellContent);
  233. if ($cellContent > '')
  234. $cellContent .= ' ';
  235. // echo 'END OF STYLING, SPAN OR DIV<br />';
  236. break;
  237. case 'hr' :
  238. $this->_flushCell($sheet,$column,$row,$cellContent);
  239. ++$row;
  240. if (isset($this->_formats[$child->nodeName])) {
  241. $sheet->getStyle($column.$row)->applyFromArray($this->_formats[$child->nodeName]);
  242. } else {
  243. $cellContent = '----------';
  244. $this->_flushCell($sheet,$column,$row,$cellContent);
  245. }
  246. ++$row;
  247. case 'br' :
  248. if ($this->_tableLevel > 0) {
  249. // If we're inside a table, replace with a \n
  250. $cellContent .= "\n";
  251. } else {
  252. // Otherwise flush our existing content and move the row cursor on
  253. $this->_flushCell($sheet,$column,$row,$cellContent);
  254. ++$row;
  255. }
  256. // echo 'HARD LINE BREAK: ' , '<br />';
  257. break;
  258. case 'a' :
  259. // echo 'START OF HYPERLINK: ' , '<br />';
  260. foreach($attributeArray as $attributeName => $attributeValue) {
  261. switch($attributeName) {
  262. case 'href':
  263. // echo 'Link to ' , $attributeValue , '<br />';
  264. $sheet->getCell($column.$row)->getHyperlink()->setUrl($attributeValue);
  265. if (isset($this->_formats[$child->nodeName])) {
  266. $sheet->getStyle($column.$row)->applyFromArray($this->_formats[$child->nodeName]);
  267. }
  268. break;
  269. }
  270. }
  271. $cellContent .= ' ';
  272. $this->_processDomElement($child,$sheet,$row,$column,$cellContent);
  273. // echo 'END OF HYPERLINK:' , '<br />';
  274. break;
  275. case 'h1' :
  276. case 'h2' :
  277. case 'h3' :
  278. case 'h4' :
  279. case 'h5' :
  280. case 'h6' :
  281. case 'ol' :
  282. case 'ul' :
  283. case 'p' :
  284. if ($this->_tableLevel > 0) {
  285. // If we're inside a table, replace with a \n
  286. $cellContent .= "\n";
  287. // echo 'LIST ENTRY: ' , '<br />';
  288. $this->_processDomElement($child,$sheet,$row,$column,$cellContent);
  289. // echo 'END OF LIST ENTRY:' , '<br />';
  290. } else {
  291. if ($cellContent > '') {
  292. $this->_flushCell($sheet,$column,$row,$cellContent);
  293. $row += 2;
  294. }
  295. // echo 'START OF PARAGRAPH: ' , '<br />';
  296. $this->_processDomElement($child,$sheet,$row,$column,$cellContent);
  297. // echo 'END OF PARAGRAPH:' , '<br />';
  298. $this->_flushCell($sheet,$column,$row,$cellContent);
  299. if (isset($this->_formats[$child->nodeName])) {
  300. $sheet->getStyle($column.$row)->applyFromArray($this->_formats[$child->nodeName]);
  301. }
  302. $row += 2;
  303. $column = 'A';
  304. }
  305. break;
  306. case 'li' :
  307. if ($this->_tableLevel > 0) {
  308. // If we're inside a table, replace with a \n
  309. $cellContent .= "\n";
  310. // echo 'LIST ENTRY: ' , '<br />';
  311. $this->_processDomElement($child,$sheet,$row,$column,$cellContent);
  312. // echo 'END OF LIST ENTRY:' , '<br />';
  313. } else {
  314. if ($cellContent > '') {
  315. $this->_flushCell($sheet,$column,$row,$cellContent);
  316. }
  317. ++$row;
  318. // echo 'LIST ENTRY: ' , '<br />';
  319. $this->_processDomElement($child,$sheet,$row,$column,$cellContent);
  320. // echo 'END OF LIST ENTRY:' , '<br />';
  321. $this->_flushCell($sheet,$column,$row,$cellContent);
  322. $column = 'A';
  323. }
  324. break;
  325. case 'table' :
  326. $this->_flushCell($sheet,$column,$row,$cellContent);
  327. $column = $this->_setTableStartColumn($column);
  328. // echo 'START OF TABLE LEVEL ' , $this->_tableLevel , '<br />';
  329. if ($this->_tableLevel > 1)
  330. --$row;
  331. $this->_processDomElement($child,$sheet,$row,$column,$cellContent);
  332. // echo 'END OF TABLE LEVEL ' , $this->_tableLevel , '<br />';
  333. $column = $this->_releaseTableStartColumn();
  334. if ($this->_tableLevel > 1) {
  335. ++$column;
  336. } else {
  337. ++$row;
  338. }
  339. break;
  340. case 'thead' :
  341. case 'tbody' :
  342. $this->_processDomElement($child,$sheet,$row,$column,$cellContent);
  343. break;
  344. case 'tr' :
  345. ++$row;
  346. $column = $this->_getTableStartColumn();
  347. $cellContent = '';
  348. // echo 'START OF TABLE ' , $this->_tableLevel , ' ROW<br />';
  349. $this->_processDomElement($child,$sheet,$row,$column,$cellContent);
  350. // echo 'END OF TABLE ' , $this->_tableLevel , ' ROW<br />';
  351. break;
  352. case 'th' :
  353. case 'td' :
  354. // echo 'START OF TABLE ' , $this->_tableLevel , ' CELL<br />';
  355. $this->_processDomElement($child,$sheet,$row,$column,$cellContent);
  356. // echo 'END OF TABLE ' , $this->_tableLevel , ' CELL<br />';
  357. $this->_flushCell($sheet,$column,$row,$cellContent);
  358. ++$column;
  359. break;
  360. case 'body' :
  361. $row = 1;
  362. $column = 'A';
  363. $content = '';
  364. $this->_tableLevel = 0;
  365. $this->_processDomElement($child,$sheet,$row,$column,$cellContent);
  366. break;
  367. default:
  368. $this->_processDomElement($child,$sheet,$row,$column,$cellContent);
  369. }
  370. }
  371. }
  372. }
  373. /**
  374. * Loads PHPExcel from file into PHPExcel instance
  375. *
  376. * @param string $pFilename
  377. * @param PHPExcel $objPHPExcel
  378. * @return PHPExcel
  379. * @throws PHPExcel_Reader_Exception
  380. */
  381. public function loadIntoExisting($pFilename, PHPExcel $objPHPExcel)
  382. {
  383. // Open file to validate
  384. $this->_openFile($pFilename);
  385. if (!$this->_isValidFormat()) {
  386. fclose ($this->_fileHandle);
  387. throw new PHPExcel_Reader_Exception($pFilename . " is an Invalid HTML file.");
  388. }
  389. // Close after validating
  390. fclose ($this->_fileHandle);
  391. // Create new PHPExcel
  392. while ($objPHPExcel->getSheetCount() <= $this->_sheetIndex) {
  393. $objPHPExcel->createSheet();
  394. }
  395. $objPHPExcel->setActiveSheetIndex( $this->_sheetIndex );
  396. // Create a new DOM object
  397. $dom = new domDocument;
  398. // Reload the HTML file into the DOM object
  399. $loaded = $dom->loadHTMLFile($pFilename, PHPExcel_Settings::getLibXmlLoaderOptions());
  400. if ($loaded === FALSE) {
  401. throw new PHPExcel_Reader_Exception('Failed to load ',$pFilename,' as a DOM Document');
  402. }
  403. // Discard white space
  404. $dom->preserveWhiteSpace = false;
  405. $row = 0;
  406. $column = 'A';
  407. $content = '';
  408. $this->_processDomElement($dom,$objPHPExcel->getActiveSheet(),$row,$column,$content);
  409. // echo '<hr />';
  410. // var_dump($this->_dataArray);
  411. // Return
  412. return $objPHPExcel;
  413. }
  414. /**
  415. * Get sheet index
  416. *
  417. * @return int
  418. */
  419. public function getSheetIndex() {
  420. return $this->_sheetIndex;
  421. }
  422. /**
  423. * Set sheet index
  424. *
  425. * @param int $pValue Sheet index
  426. * @return PHPExcel_Reader_HTML
  427. */
  428. public function setSheetIndex($pValue = 0) {
  429. $this->_sheetIndex = $pValue;
  430. return $this;
  431. }
  432. }