module uim.html.parser;

import uim.html;

/**
 * Finds every non-overlapping occurrence of `searchTxt` in `text`.
 *
 * Params:
 *   text      = text to search in
 *   searchTxt = text to look for; an empty string yields no results
 *
 * Returns: the start index of every match, in ascending order.
 */
size_t[] posOfAll(string text, string searchTxt) {
  // Searching the whole text is the ranged search over [0, text.length).
  return posOfAll(text, searchTxt, 0, text.length);
}

/**
 * Finds every non-overlapping occurrence of `searchTxt` in `text` whose start
 * index lies in the half-open range [startPos, endPos).
 *
 * Params:
 *   text      = text to search in
 *   searchTxt = text to look for; an empty string yields no results
 *   startPos  = index at which the search starts
 *   endPos    = first index that is no longer accepted as a match start
 *
 * Returns: the start index of every accepted match, in ascending order.
 */
size_t[] posOfAll(string text, string searchTxt, size_t startPos, size_t endPos) {
  size_t[] results;
  if (searchTxt.length == 0)
    return results;

  size_t currentPos = startPos;
  while (currentPos < endPos) {
    // indexOf reports "not found" as -1, so keep the result signed until it is checked.
    immutable ptrdiff_t found = text.indexOf(searchTxt, currentPos);
    if (found < 0)
      break; // not found

    immutable pos = cast(size_t) found;
    if (pos >= endPos)
      break; // the next match starts behind the requested range

    results ~= pos;
    currentPos = pos + searchTxt.length; // continue behind the match (no overlaps)
  }

  return results;
}
unittest {
  assert("x x x x".posOfAll(" ", 2, 6) == [3, 5]);
  assert("x x x x".posOfAll(" ") == [1, 3, 5]);
  assert("aaaa".posOfAll("aa") == [0, 2]);
  assert("abc".posOfAll("").length == 0);
  assert("abc".posOfAll("x").length == 0);
}

/**
 * Overwrites `txt` with `addTxt`, once at every index in [startPos, endPos)
 * that is still inside the text.
 *
 * With a one-character `addTxt` this replaces every character of the range by
 * that character and keeps the length of the text (this is how the parser
 * masks quoted strings). A longer `addTxt` is written at each index in turn,
 * each write overlapping the previous one; a write that reaches past the end
 * of the text simply replaces the remaining tail.
 *
 * Params:
 *   txt      = source text (not modified)
 *   addTxt   = text written at every index of the range
 *   startPos = first index to overwrite
 *   endPos   = first index that is not overwritten anymore
 *
 * Returns: the overwritten copy of `txt`.
 */
string fillWith(string txt, string addTxt, size_t startPos, size_t endPos) {
  string result = txt;

  for (size_t i = startPos; (i < endPos) && (i < result.length); i++) {
    // Clamp the tail: slicing behind the end of the text would raise a RangeError.
    immutable tailStart = i + addTxt.length;
    string tail = (tailStart < result.length) ? result[tailStart .. $] : "";
    result = result[0 .. i] ~ addTxt ~ tail;
  }

  return result;
}
unittest {
  assert("abcdef".fillWith("s", 1, 4) == "asssef");
  assert("abc".fillWith("s", 1, 10) == "ass");
  assert("abc".fillWith("s", 5, 10) == "abc");
  assert("abc".fillWith("xy", 1, 3) == "axxy"); // used to raise a RangeError
}

/**
 * Node of a parsed HTML document.
 *
 * A node is either the root (isRoot), a piece of text content (isContent) or a
 * tag (isStartTag / isEndTag; both flags set marks a tag without separate end tag).
 */
class DH5Node
{
  string txt;              // text assigned by the parsers (tag markup or content text)
  bool isRoot = true;      // parsers reset this on every node they create
  bool isContent = false;
  bool isStartTag = false;
  bool isEndTag = false;
  bool hasEndTag = true;   // not read by the code in this module
  size_t startPos;         // not read by the code in this module
  size_t startEnd;         // not read by the code in this module

  string _tag;
  size_t level;            // nesting depth assigned by the parsers
  DH5Node[] nodes;         // child nodes
  DH5Node rootNode;        // node handed to the constructor (parse2 passes the parent node)

  string _id;
  string[] _classes;
  string[string] _attributes;

  this() {
  }

  this(DH5Node myRoot) {
    this();
    rootNode = myRoot;
  }

  this(string myTagName) {
    this();
    tag = myTagName;
  }

  this(DH5Node myRoot, string myTagName) {
    this();
    rootNode = myRoot;
    tag = myTagName;
  }

  /// Tag name; stored stripped.
  @property auto tag() { return _tag; }
  @property void tag(string newTag) { _tag = newTag.strip; }
  /// Name of the H5 function for this tag, e.g. "H5Div".
  string tagToH5() { return `H5%s`.format(capitalize(_tag)); }

  /// Value of the id attribute; stored stripped.
  @property auto id() { return _id; }
  @property void id(string newId) { _id = newId.strip; }
  /// The id as quoted string.
  string idToH5() { return `"%s"`.format(_id); }

  /// CSS classes of the node.
  @property auto classes() { return _classes; }
  /// Sets the classes from a space separated list; empty entries are dropped.
  @property void classes(string newClasses) {
    _classes = [];
    foreach (item; newClasses.split(" ")) {
      auto name = item.strip;
      if (name.length > 0) _classes ~= name;
    }
  }
  /// The classes as `["a", "b"]`, or null if the node has no classes.
  string classesToH5() {
    if (_classes.length == 0) return null;

    string[] results;
    foreach (c; _classes) results ~= `"%s"`.format(c);
    return "[" ~ results.join(", ") ~ "]";
  }

  /// Attributes of the node without "id" and "class".
  @property auto attributes() { return _attributes; }
  /**
   * Sets the attributes. "id" and "class" are moved into the id / classes
   * properties and are not kept in the attribute table.
   */
  @property void attributes(string[string] newAttributes) {
    // Work on a copy, removing "id"/"class" must not change the caller's table.
    _attributes = newAttributes.dup;
    if ("id" in _attributes) {
      id = _attributes["id"]; _attributes.remove("id");
    }
    if ("class" in _attributes) {
      classes = _attributes["class"]; _attributes.remove("class");
    }
  }

  /// The attributes as `["key":"value",...]`; values that already contain a quote are emitted as they are.
  string attributesToH5() {
    string[] results;
    foreach (k, v; _attributes)
      if (k.length > 0)
        results ~= `"%s":%s`.format(k, (v.indexOf(`"`) > -1 ? v : `"` ~ v ~ `"`));
    return "[" ~ results.join(",") ~ "]";
  }

  /**
   * Builds the child tree of this node from a flat node list with levels.
   *
   * Nodes on the lowest level become children of this node; the nodes between
   * a start tag and its end tag on that level are handed recursively to the
   * start tag node.
   */
  void setNodes(DH5Node[] newNodes) {
    nodes = null;
    auto lowest = minLevel(newNodes);

    DH5Node levelNode;   // currently open start tag on the lowest level
    DH5Node[] subNodes;  // deeper nodes collected for levelNode
    foreach (node; newNodes) {
      if (node.level != lowest) {
        subNodes ~= node;
        continue;
      }

      if ((node.isContent) || (node.isStartTag && node.isEndTag)) { // single node
        subNodes = null;
        nodes ~= node;
      }
      else if (node.isStartTag) { // start node
        subNodes = null;
        levelNode = node;
      }
      else if (node.isEndTag) { // end node
        // An end tag without an open start tag is ignored instead of dereferencing null.
        if (levelNode !is null) {
          levelNode.setNodes(subNodes);
          nodes ~= levelNode;
          levelNode = null; // closed; a further stray end tag must not add it again
        }
        subNodes = null;
      }
    }
  }

  /// Converts the node (for the root: all child nodes) to H5 objects.
  DH5Obj[] toH5() {
    if (isRoot) {
      DH5Obj[] results;
      foreach (node; nodes) {
        results ~= node.toH5;
      }
      return results;
    }

    if (isContent)
      return [H5String(txt)];
    if (isStartTag && isEndTag)
      return [H5Obj(_attributes).tag(tag).single(true)];
    return [H5Obj(_attributes).tag(tag).content(nodes.toH5)];
  }

  /// Renders the node (for the root: all child nodes) as H5 call text, e.g. `H5Div("id",["class"],...)`.
  string toH5String() {
    if (isRoot)
      return nodes.toH5String;

    if (isContent)
      return `"` ~ txt ~ `"`;

    string[] tagContent;
    if (id.length > 0)
      tagContent ~= idToH5;
    if (classes.length > 0)
      tagContent ~= classesToH5;
    if (attributes.length > 0)
      tagContent ~= attributesToH5;

    // Tags without separate end tag have no children to render.
    immutable isSingle = isStartTag && isEndTag;
    if (!isSingle && nodes.length > 0)
      tagContent ~= nodes.toH5String;
    return "H5%s(%s)".format(tag.capitalize, tagContent.join(","));
  }
}

/// Creates an empty node.
auto H5Node() {
  return new DH5Node;
}

/// Creates a node that references `myRoot` as its rootNode.
auto H5Node(DH5Node myRoot) {
  return new DH5Node(myRoot);
}

/// Creates a node with the given tag name.
auto H5Node(string myTagName) {
  return new DH5Node(myTagName);
}

/// Creates a node with the given tag name that references `myRoot` as its rootNode.
auto H5Node(DH5Node myRoot, string myTagName) {
  return new DH5Node(myRoot, myTagName);
}

/// Converts every node of the list to H5 objects and concatenates the results.
DH5Obj[] toH5(DH5Node[] someNodes) {
  DH5Obj[] results;
  foreach (node; someNodes)
    results ~= node.toH5;
  return results;
}

/// Renders every node of the list with toH5String, separated by commas.
auto toH5String(DH5Node[] someNodes) {
  string[] results;
  foreach (node; someNodes)
    results ~= node.toH5String;
  return results.join(",");
}

/// Lowest level found in `newNodes`; `size_t.max` for an empty list.
size_t minLevel(DH5Node[] newNodes) {
  if (newNodes.length == 0)
    return size_t.max; // same value the former `return -1` produced

  size_t result = newNodes[0].level;
  foreach (node; newNodes[1 .. $])
    if (node.level < result)
      result = node.level;
  return result;
}

/**
 * Parses HTML by splitting it at `<` and `>` and returns the root node of the
 * resulting tree.
 *
 * Text in front of the first `<` is ignored. Tags containing `/>`, `< `,
 * `<!doctype`, `<img`, `<br`, `<meta` or `<link` are handled as tags without
 * separate end tag.
 */
DH5Node parse(string html) {
  DH5Node rootNode = H5Node(html);

  // Split at '<' and put the '<' back in front of every piece.
  auto pieces = html.replace("\n", "").split("<");
  string[] level1Items = (pieces.length > 1) ? pieces[1 .. $] : null; // empty input has no pieces at all
  foreach (ref item; level1Items)
    item = "<" ~ item;

  // Split every piece at '>' into the tag (gets its '>' back) and the content behind it.
  string[] level2Items;
  foreach (item; level1Items) {
    foreach (it; item.split(">")) {
      if (it.strip.length == 0)
        continue;
      level2Items ~= (it.indexOf("<") > -1) ? it ~ ">" : it;
    }
  }

  DH5Node[] newNodes;
  foreach (value; level2Items) {
    auto node = new DH5Node; // a class reference has to be allocated before it is used
    auto v = value.strip.toLower;
    if (v.indexOf("<") == -1)
      node.isContent = true;
    else if (v.indexOf("</") > -1)
      node.isEndTag = true;
    else {
      node.isStartTag = true;
      // Markers of tags that have no separate end tag.
      foreach (marker; ["/>", "< ", "<!doctype", "<img", "<br", "<meta", "<link"]) {
        if (v.indexOf(marker) > -1) {
          node.isEndTag = true;
          break;
        }
      }
    }
    node.isRoot = false; // exactly one of the three kinds above is always set
    node.txt = value;

    if (node.isStartTag) {
      // <xxx> -> xxx / <xxx a="b"> -> xxx
      auto parts = node.txt.strip.replace("<", "").replace(">", "").split(" ");
      node.tag = (parts.length > 0) ? parts[0] : ""; // "<>" has no name
    }
    else if (node.isEndTag)
      node.tag = node.txt.strip.replace("</", "").replace(">", "");

    newNodes ~= node;
  }

  // Assign levels: a start tag opens a level, an end tag closes it.
  size_t depth = 0;
  foreach (node; newNodes) {
    if (node.isContent || (node.isStartTag && node.isEndTag)) {
      node.level = depth;
    }
    else if (node.isStartTag) {
      node.level = depth;
      depth++;
    }
    else if (node.isEndTag) {
      if (depth > 0) depth--; // a stray end tag must not wrap the unsigned level
      node.level = depth;
    }
  }

  rootNode.setNodes(newNodes);
  return rootNode;
}

unittest
{
  auto root = parse("<p>hi</p>");
  assert(root.nodes.length == 1);
  assert(root.nodes[0].tag == "p");
  assert(root.nodes[0].nodes.length == 1);
  assert(root.nodes[0].nodes[0].txt == "hi");

  assert(parse("").nodes.length == 0);
}

/**
 * Parses HTML with quoted strings masked, so that `<`, `>` and spaces inside
 * attribute values do not disturb the tag detection.
 *
 * Steps: join the stripped lines, cut out comments, mask everything between
 * double quotes, pair the n-th `<` with the n-th `>`, then build the node tree.
 * Parsing stops at the first tag that cannot be paired.
 *
 * Returns: the node that is open when the input ends. For balanced input this
 * is the root node.
 * NOTE(review): a tag that is never closed and is not in the void tag list
 * below (e.g. a doctype) stays open, so its node is returned instead of the
 * root - kept as it was, confirm whether callers rely on it.
 */
auto parse2(string html) {
  auto startComment = "<!--";
  auto endComment = "-->";

  // Step 0: Delete end of lines
  string[] newLines;
  foreach (line; html.split("\n")) {
    auto stripped = line.strip;
    if (stripped.length > 0) newLines ~= stripped;
  }
  auto htmlBase = newLines.join("");

  // Step 1: Cut out comments
  debug writeln("Step 1: Cut out comments");
  size_t commentPos;
  while (true) {
    immutable startCPos = htmlBase.indexOf(startComment, commentPos);
    if (startCPos < 0)
      break; // no further comment

    immutable endCPos = htmlBase.indexOf(endComment, startCPos + startComment.length);
    if (endCPos < 0) { // comment is never closed: drop the rest of the text
      htmlBase = htmlBase[0 .. startCPos];
      break;
    }
    htmlBase = htmlBase[0 .. startCPos] ~ htmlBase[endCPos + endComment.length .. $];
    commentPos = startCPos;
  }

  // Step 2: Read quotes. The mask keeps the length of htmlBase, so positions
  // found in the mask are valid in htmlBase too.
  debug writeln("Step 2: Read quotes");
  string htmlMask = htmlBase.replace(`\"`, "ss"); // Replace "false" quotes with chars
  auto stringIndicators = htmlMask.posOfAll(`"`);  // Now we have all quotes
  // `i + 1 < length` instead of `i < length - 1`: the latter wraps around for text without quotes.
  for (size_t i = 0; i + 1 < stringIndicators.length; i += 2) {
    htmlMask = htmlMask.fillWith("s", stringIndicators[i] + 1, stringIndicators[i + 1]);
  }

  // Step 3: Looking for tag limiter; every < has a > if well-formed
  debug writeln("Step 3: Looking for tag limiter");
  auto ltIndicators = htmlMask.posOfAll(`<`); // All <
  auto gtIndicators = htmlMask.posOfAll(`>`); // All >

  // Step 4: Set nodes and level
  DH5Node current = H5Node(); // node that receives the next children; starts as the root
  size_t level;
  foreach (index, ltPos; ltIndicators) {
    if (index >= gtIndicators.length)
      break; // Not well-formed :-(
    immutable gtPos = gtIndicators[index];
    if (gtPos < ltPos)
      break; // '>' in front of its '<': not well-formed either

    DH5Node node = H5Node(current);
    node.isRoot = false;

    string tag = htmlMask[ltPos .. gtPos + 1]; // Should be <tag> or </tag> or <tag/>
    auto tagSpaces = posOfAll(htmlMask, " ", ltPos, gtPos - 1);
    debug writeln("tag;", tag);
    debug writeln(tagSpaces);

    auto spacesSeparated = tag.split(" "); // separate in spaces if exists
    string tagName;
    node.level = level;
    if (tag.indexOf("</") == 0) { // is EndTag
      node.isEndTag = true;
      tagName = spacesSeparated[0].replace("</", "").replace(">", "").strip;
      // Go back to the parent; a stray end tag on top level has no parent and is ignored.
      if (current.rootNode !is null)
        current = current.rootNode;
      if (level > 0)
        level--;
    }
    else {
      node.isStartTag = true;
      tagName = spacesSeparated[0].replace("<", "").replace(">", "").strip;
      current.nodes ~= node;
      switch (tagName) {
        case "link", "meta", "img", "input", "br":
          break; // same level
        default:
          current = node;
          level++;
          break;
      }

      if (tagSpaces.length > 0) {
        debug writeln("T:", htmlBase[ltPos + 1 .. tagSpaces[0]]);

        // Every space starts a parameter that ends at the next space or at the '>'.
        string[] parameters;
        foreach (spaceI, spacePos; tagSpaces) {
          immutable paramEnd = (spaceI + 1 < tagSpaces.length) ? tagSpaces[spaceI + 1] : gtPos;
          auto para = htmlBase[spacePos .. paramEnd].strip;
          if (para.length > 0) parameters ~= para; // repeated spaces give empty parameters
        }
        debug writeln(parameters);

        string[string] attributes;
        foreach (para; parameters) {
          immutable pos = para.indexOf("=");
          if (pos == -1) { // attribute without value
            attributes[para] = para;
            continue;
          }
          auto val = para[pos + 1 .. $];
          // Remove the surrounding quotes if there are any; unquoted values are kept as they are.
          if (val.length >= 2 && val[0] == '"' && val[$ - 1] == '"')
            val = val[1 .. $ - 1];
          attributes[para[0 .. pos]] = val;
        }
        debug writeln(attributes);
        node.attributes = attributes; // the setter moves "id" and "class" into their properties
      }
      node.level = level;
    }
    node.tag = tagName;

    // Everything between this '>' and the next '<' is content of the current node.
    if (index + 1 < ltIndicators.length) {
      immutable contentStart = gtPos + 1;
      immutable contentEnd = ltIndicators[index + 1];
      if (contentStart <= contentEnd) {
        // An empty content still gets its (empty) node, as before.
        DH5Node sNode = H5Node(current);
        sNode.isRoot = false;
        sNode.isContent = true;
        sNode.txt = htmlBase[contentStart .. contentEnd];
        sNode.level = level;

        current.nodes ~= sNode;
      }
    }
  }

  return current;
}

unittest
{
  import std.file : exists;

  // No quotes at all and several attributes: both used to fail.
  auto plain = parse2("<p>hi</p>");
  assert(plain.nodes.length == 1);
  assert(plain.nodes[0].tag == "p");
  assert(plain.nodes[0].nodes[0].txt == "hi");

  auto doc = parse2(`<a id="x" href="u" title="t">go</a>`);
  assert(doc.nodes.length == 1);
  auto link = doc.nodes[0];
  assert(link.tag == "a");
  assert(link.id == "x");
  assert(link.attributes["href"] == "u");
  assert(link.attributes["title"] == "t");
  assert(link.nodes.length == 1 && link.nodes[0].txt == "go");

  // Conversion of a sample file; skipped when the sample is not available.
  if (exists("html.txt")) {
    auto nodes = parse2(readText("html.txt"));

    auto f = File("h5.txt", "w"); // open for writing
    f.write(nodeToH5(nodes.nodes, 0));
  }
}

/// Walks the node tree; the output statement is currently disabled.
void writelnNodes(DH5Node[] nodes) {
  foreach (node; nodes) {
    // writeln(mul(" ", node.level), "\t", node.level, ":", node.tagToH5, "(", "H5String(\"" ~ node.txt.strip ~ "\")");
    if (node.nodes.length > 0)
      writelnNodes(node.nodes);
  }
}

/// Puts `multiple * step` spaces in front of `txt`.
string intender(string txt, size_t multiple, size_t step = 2) {
  auto pad = new char[](multiple * step);
  pad[] = ' ';
  return pad.idup ~ txt;
}

/**
 * Renders a node list as H5 source text.
 *
 * A list that only consists of one content node is rendered as plain quoted
 * string; otherwise every node is rendered with nodeToH5(node, level) on its
 * own line and the results are joined with commas.
 */
string nodeToH5(DH5Node[] nodes, size_t level) {
  string[] results;

  if ((nodes.length == 1) && nodes[0].isContent) {
    results ~= ((nodes[0].txt.length > 0) ? "\"%s\"".format(nodes[0].txt) : "");
  }
  else {
    foreach (node; nodes) {
      auto result = nodeToH5(node, level);
      if (result.length > 0) results ~= intender("\n" ~ result, level);
    }
  }
  // Clean up separators left behind by empty entries.
  return results.join(",").replace(",)", ")").replace(",]", "]").replace("\"\",", ",");
}

/**
 * Renders one node as H5 source text.
 *
 * Start tags become `H5Tag(id,[classes],[attributes],children)`, content nodes
 * with text become `H5String("text")`; every other node renders as empty text.
 */
string nodeToH5(DH5Node node, size_t level) {
  if (node.isStartTag) {
    string[] vals;

    if (node.id.strip.length > 0) vals ~= node.idToH5;
    if (node.classes.length > 0) vals ~= node.classesToH5;

    string[] ats;
    foreach (k, v; node.attributes) {
      ats ~= "\"%s\":\"%s\"".format(k, v.strip);
    }
    if (ats.length > 0) vals ~= "[" ~ ats.join(",") ~ "]";

    if (node.nodes.length > 0) vals ~= nodeToH5(node.nodes, level + 1);
    else if (node.txt.length > 0) vals ~= "\"" ~ node.txt ~ "\"";

    return intender("H5" ~ capitalize(node.tag) ~ "(%s\n".format(vals.join(",")) ~ ")", level);
  }

  if (node.isContent && (node.txt.length > 0))
    return intender("H5String(\"%s\")".format(node.txt), level);

  return null;
}