1 module uim.html.parser;
2 
3 import uim.html;
4 
/**
 * Collects the start offsets of all non-overlapping occurrences of
 * `searchTxt` in `text`, scanning from left to right.
 *
 * Params:
 *   text      = string that is searched
 *   searchTxt = string to look for; an empty value yields no hits
 *
 * Returns: the offsets in ascending order (empty when nothing matches).
 */
size_t[] posOfAll(string text, string searchTxt) {
  size_t[] hits;
  if (searchTxt.length == 0)
    return hits;

  size_t cursor = 0;
  while (cursor < text.length) {
    immutable found = text.indexOf(searchTxt, cursor);
    if (found < 0)
      break; // no further occurrence

    hits ~= cast(size_t) found;
    cursor = cast(size_t) found + searchTxt.length; // continue behind this hit
  }

  return hits;
}
23 
/**
 * Collects the start offsets of the non-overlapping occurrences of
 * `searchTxt` in `text` that begin inside the window [startPos, endPos).
 *
 * Params:
 *   text      = string that is searched
 *   searchTxt = string to look for; an empty value yields no hits
 *   startPos  = offset where the scan starts
 *   endPos    = exclusive upper bound for the start of a reported hit
 *
 * Returns: the offsets in ascending order (empty when nothing matches).
 */
size_t[] posOfAll(string text, string searchTxt, size_t startPos, size_t endPos) {
  size_t[] hits;
  if (searchTxt.length == 0)
    return hits;

  size_t cursor = startPos;
  while (cursor < endPos) {
    immutable found = text.indexOf(searchTxt, cursor);
    if (found < 0)
      break; // no further occurrence

    immutable pos = cast(size_t) found;
    if (pos >= endPos)
      break; // the next hit starts behind the window, so the scan is finished

    hits ~= pos;
    cursor = pos + searchTxt.length;
  }

  return hits;
}
// Checks the windowed posOfAll overload (no console output, so test runs stay quiet).
unittest {
  // only hits that start inside [2, 6) are reported
  assert("x x x x".posOfAll(" ", 2, 6) == [3, 5]);
  // the upper bound is exclusive: the blank at offset 1 is outside [0, 1)
  assert("x x x x".posOfAll(" ", 0, 1).length == 0);
  // an empty search text never matches
  assert("x x x x".posOfAll("", 0, 7).length == 0);
}
45 
/**
 * Returns a copy of `txt` in which the code units in [startPos, endPos) are
 * overwritten with `addTxt`, repeated as often as needed and cut off at the
 * end of the range. The length of the text never changes.
 *
 * For a one-character `addTxt` every position in the range becomes that
 * character. Longer fill texts are applied cyclically and never written
 * beyond `endPos` or the end of `txt`.
 *
 * Params:
 *   txt      = source text (not modified)
 *   addTxt   = fill pattern; an empty pattern leaves the text unchanged
 *   startPos = first offset that is overwritten
 *   endPos   = exclusive end offset; values beyond `txt.length` are clamped
 *
 * Returns: the masked text, or `txt` itself when nothing has to be changed.
 */
string fillWith(string txt, string addTxt, size_t startPos, size_t endPos) {
  if (addTxt.length == 0)
    return txt;

  immutable stop = endPos < txt.length ? endPos : txt.length;
  if (startPos >= stop)
    return txt;

  // Work on one mutable copy instead of re-slicing the string per position.
  char[] buffer = txt.dup;
  foreach (i; startPos .. stop)
    buffer[i] = addTxt[(i - startPos) % addTxt.length];

  return buffer.idup;
}
55 
/**
 * Node of the intermediate tree that the parsers in this module build from
 * HTML text. A node is either the root, a piece of text content, a start tag,
 * an end tag, or a start tag that is closed at once (start and end flag set).
 */
class DH5Node
{
  string txt;              // raw text of the node (content text or tag text)
  bool isRoot = true;      // nodes are roots until a parser clears the flag
  bool isContent = false;  // node is plain text between tags
  bool isStartTag = false; // node is an opening tag
  bool isEndTag = false;   // node is a closing tag (together with isStartTag: closed at once)
  bool hasEndTag = true;
  size_t startPos;
  size_t startEnd;

  string _tag;
  size_t level;            // nesting depth assigned by the parser
  DH5Node[] nodes;         // child nodes
  DH5Node rootNode;        // node handed over at construction time

  string _id;
  string[] _classes;
  string[string] _attributes;

  this() {
  }

  this(DH5Node myRoot) {
    this();
    rootNode = myRoot;
  }

  this(string myTagName) {
    this();
    tag = myTagName;
  }

  this(DH5Node myRoot, string myTagName) {
    this();
    rootNode = myRoot;
    tag = myTagName;
  }

  /// Tag name; the setter stores the stripped value.
  @property auto tag() { return _tag; }
  @property void tag(string newTag) { _tag = newTag.strip; }
  /// Name of the generated call: "H5" followed by the capitalized tag name.
  string tagToH5() { return `H5%s`.format(capitalize(_tag)); }

  /// Value of the id attribute; the setter stores the stripped value.
  @property auto id() { return _id; }
  @property void id(string newId) { _id = newId.strip; }
  /// The id wrapped in double quotes.
  string idToH5() { return `"%s"`.format(_id); }

  /// Class names; the setter splits a blank separated list and drops empty entries.
  @property auto classes() { return _classes; }
  @property void classes(string newClasses) {
    _classes = [];
    foreach (item; newClasses.split(" ")) {
      auto name = item.strip;
      if (name.length > 0) _classes ~= name;
    }
  }
  /// Class names as a quoted list literal, e.g. ["a", "b"]; null without classes.
  string classesToH5() {
    if (_classes.length == 0) return null;

    string[] results;
    foreach (c; _classes) results ~= `"%s"`.format(c);
    return "[" ~ results.join(", ") ~ "]";
  }

  /// Remaining attributes (everything except "id" and "class").
  @property auto attributes() { return _attributes; }
  /// Stores the given array by reference; "id" and "class" are moved into the
  /// dedicated properties and removed from that array.
  @property void attributes(string[string] newAttributes) {
    _attributes = newAttributes;
    if ("id" in _attributes) {
      id = _attributes ["id"]; _attributes.remove("id");
    }
    if ("class" in _attributes) {
      classes = _attributes ["class"]; _attributes.remove("class");
    }
  }

  /// Attributes as list literal of "key":"value" pairs; a value that already
  /// contains a double quote is emitted without additional quotes.
  string attributesToH5() {
    string[] results;
    foreach (k, v; _attributes)
      if (k.length > 0)
        results ~= `"%s":%s`.format(k, (v.indexOf(`"`) > -1 ? v : `"` ~ v ~ `"`));
    return "[" ~ results.join(",") ~ "]";
  }

  /**
   * Rebuilds the children of this node from a flat, level-annotated list.
   *
   * Nodes on the lowest level of the list become direct children; the nodes
   * between a start tag and its end tag on that level are handed to the
   * start tag node recursively. An end tag without an open start tag on the
   * same level is ignored, and a start tag node is attached only once.
   */
  void setNodes(DH5Node[] newNodes) {
    nodes = null;
    auto min = minLevel(newNodes);

    DH5Node levelNode; // open start tag on the lowest level, null if there is none
    DH5Node[] subNodes;
    foreach (node; newNodes)
    {
      if (node.level != min) {
        subNodes ~= node; // belongs to the currently open tag
        continue;
      }

      if ((node.isContent) || (node.isStartTag && node.isEndTag))
      { // single node
        subNodes = null;
        nodes ~= node;
      }
      else if (node.isStartTag)
      { // start node
        subNodes = null;
        levelNode = node;
      }
      else if (node.isEndTag)
      { // end node
        if (levelNode !is null) {
          levelNode.setNodes(subNodes);
          nodes ~= levelNode;
          levelNode = null; // consumed; a second end tag must not attach it again
        }
        subNodes = null;
      }
    }
  }

  /// Converts this node (for the root: all children) into H5 objects.
  DH5Obj[] toH5() {
    if (isRoot)
    {
      DH5Obj[] results;
      foreach (node; nodes) {
        results ~= node.toH5;
      }
      return results;
    }
    else
    {
      if (isContent)
        return [H5String(txt)];
      if (isStartTag && isEndTag)
        return [H5Obj(_attributes).tag(tag).single(true)];
      return [H5Obj(_attributes).tag(tag).content(nodes.toH5)];
    }
  }

  /// Renders this node (for the root: all children) as source text of H5 calls.
  string toH5String() {
    if (isRoot)
      return nodes.toH5String;
    else
    {
      if (isContent)
        return `"` ~ txt ~ `"`;
      string[] tagContent;
      if (id.length > 0)
        tagContent ~= idToH5;
      if (classes.length > 0)
        tagContent ~= classesToH5;
      if (attributes.length > 0)
        tagContent ~= attributesToH5;
      if (isStartTag && isEndTag) // closed at once: children are not rendered
        return "H5%s(%s)".format(tag.capitalize, tagContent.join(","));
      if (nodes.length > 0)
        tagContent ~= nodes.toH5String;
      return "H5%s(%s)".format(tag.capitalize, tagContent.join(","));
    }
  }
}
219 
/// Creates a node without parent link and tag name.
auto H5Node() {
  auto created = new DH5Node();
  return created;
}

/// Creates a node that refers to `myRoot`.
auto H5Node(DH5Node myRoot) {
  auto created = new DH5Node(myRoot);
  return created;
}

/// Creates a node with the tag name `myTagName`.
auto H5Node(string myTagName) {
  auto created = new DH5Node(myTagName);
  return created;
}

/// Creates a node that refers to `myRoot` and has the tag name `myTagName`.
auto H5Node(DH5Node myRoot, string myTagName) {
  auto created = new DH5Node(myRoot, myTagName);
  return created;
}
235 
/// Converts every node of `someNodes` and returns all resulting objects in one array.
DH5Obj[] toH5(DH5Node[] someNodes) {
  DH5Obj[] collected;
  for (size_t idx = 0; idx < someNodes.length; idx++) {
    auto current = someNodes[idx];
    collected ~= current.toH5;
  }
  return collected;
}
242 
/// Renders every node of `someNodes` and joins the texts with commas.
auto toH5String(DH5Node[] someNodes) {
  auto rendered = new string[someNodes.length];
  foreach (idx, current; someNodes) {
    rendered[idx] = current.toH5String;
  }
  return rendered.join(",");
}
249 
/// Returns the smallest `level` among `newNodes`; an empty array yields
/// `size_t.max` (the value -1 takes when converted to size_t).
size_t minLevel(DH5Node[] newNodes) {
  if (newNodes.length == 0)
    return size_t.max;

  size_t lowest = newNodes[0].level;
  foreach (candidate; newNodes[1 .. $]) {
    if (candidate.level < lowest)
      lowest = candidate.level;
  }
  return lowest;
}
263 
/**
 * Parses `html` into a tree of DH5Node objects.
 *
 * The text is cut into tags and content pieces, every piece becomes a node
 * with a nesting level, and the flat list is then folded into a tree by
 * DH5Node.setNodes. Text in front of the first "<" is ignored.
 *
 * Params:
 *   html = markup to parse; line breaks are removed before splitting
 *
 * Returns: the root node whose `nodes` hold the top-level elements.
 */
DH5Node parse(string html) {
  // NOTE(review): H5Node(string) takes its argument as tag name, so the complete
  // markup is stored as tag of the root. Kept unchanged for compatibility.
  DH5Node rootNode = H5Node(html);

  // Step 1: cut in front of every "<"; the part before the first "<" is dropped.
  string[] level1Items;
  auto rawItems = html.replace("\n", "").split("<");
  if (rawItems.length > 1) { // guards the slice for empty or tag-free input
    foreach (item; rawItems[1 .. $])
      level1Items ~= "<" ~ item;
  }

  // Step 2: cut behind every ">" so that tags and content are separate items.
  string[] level2Items;
  foreach (item; level1Items) {
    foreach (it; item.split(">")) {
      if (it.strip.length == 0)
        continue;
      level2Items ~= (it.indexOf("<") > -1) ? it ~ ">" : it;
    }
  }

  // Step 3: classify every item.
  DH5Node[] newNodes;
  foreach (value; level2Items) {
    DH5Node node = H5Node(); // a class reference has to be instantiated before use
    node.isRoot = false;
    node.txt = value;

    auto v = value.strip.toLower;
    if (v.indexOf("<") == -1)
      node.isContent = true;
    else if (v.indexOf("</") > -1)
      node.isEndTag = true;
    else
    {
      node.isStartTag = true;
      // These markers denote tags that are closed at once.
      foreach (marker; ["/>", "< ", "<!doctype", "<img", "<br", "<meta", "<link"])
        if (v.indexOf(marker) > -1)
          node.isEndTag = true;
    }

    if (node.isStartTag)
    {
      // <xxx> -> xxx / <xxx a="b"> -> xxx / <xxx/> -> xxx
      auto pieces = node.txt.strip.replace("<", "").replace(">", "").split(" ");
      node.tag = (pieces.length > 0) ? pieces[0].replace("/", "") : "";
    }
    else if (node.isEndTag)
      node.tag = node.txt.strip.replace("</", "").replace(">", "");

    newNodes ~= node;
  }

  // Step 4: assign nesting levels. An opening tag and its closing tag share a
  // level, everything in between is one level deeper.
  size_t depth = 0;
  foreach (node; newNodes) {
    if (node.isContent || (node.isStartTag && node.isEndTag))
    {
      node.level = depth;
    }
    else if (node.isStartTag)
    {
      node.level = depth;
      depth++;
    }
    else if (node.isEndTag)
    {
      if (depth > 0) // a stray end tag must not push the depth below zero
        depth--;
      node.level = depth;
    }
  }

  rootNode.setNodes(newNodes);
  return rootNode;
}
359 
unittest
{
  // Disabled sample calls, kept for reference; this block currently asserts nothing.
  // NOTE(review): the samples call DH5Node(...) like a function, which is not defined in
  // this part of the file - presumably parse(...) was meant; confirm before re-enabling.
  // // writeln(DH5Node(`<thead><tr><th width="1%">Product</th><th>Quantity</th><th>Price</th></tr><tr><th width="1%">Product</th><th>Quantity</th><th>Price</th></tr></thead>`));
  // // writeln(DH5Node(`<tr><th width="1%">Product</th><th>Quantity</th><th>Price</th></tr><tr><th width="1%">Product</th><th>Quantity</th><th>Price</th></tr>`));
  //// writeln(DH5Node(`<thead><tr><th width="1%">Product</th><th>Quantity</th><th>Price</th></tr><tr><th width="1%">Product</th><th>Quantity</th><th>Price</th></tr></thead>`).toH5.toPretty);

}
367 
/**
 * Parses `html` into a tree of DH5Node objects while scanning the text once.
 *
 * Steps: join the stripped lines, cut out comments, mask quoted strings so
 * that "<", ">" and blanks inside attribute values are ignored, pair the
 * n-th "<" with the n-th ">", and build the tree from the tags and the text
 * between them. Progress messages are written in debug builds only.
 *
 * Tags named link, meta, img, input, br and tags written as `<x/>` do not
 * open a new nesting level. End tags move the focus back to the parent; an
 * end tag without parent leaves the focus where it is.
 *
 * Params:
 *   html = markup to parse
 *
 * Returns: the node that is in focus when the input ends; for balanced
 *          markup this is the synthetic root whose `nodes` hold the
 *          top-level elements.
 */
auto parse2(string html) {
  auto startComment = "<!--";
  auto endComment = "-->";

  // Step 0: Delete End of Lines
  string[] newLines;
  foreach (line; html.split("\n")) {
    auto trimmed = line.strip;
    if (trimmed.length > 0) newLines ~= trimmed;
  }
  auto htmlBase = newLines.join("");

  // Step 1: Cut out comments
  debug writeln("Step 1: Cut out comments");
  size_t commentPos;
  while (true) {
    immutable startCPos = htmlBase.indexOf(startComment, commentPos);
    if (startCPos < 0)
      break; // no further comment

    immutable from = cast(size_t) startCPos;
    immutable endCPos = htmlBase.indexOf(endComment, from + startComment.length);
    if (endCPos < 0) { // unterminated comment: drop the rest of the text
      htmlBase = htmlBase[0 .. from];
      break;
    }
    htmlBase = htmlBase[0 .. from] ~ htmlBase[cast(size_t) endCPos + endComment.length .. $];
    commentPos = from;
  }

  // Step 2: Read quotes. The mask has the same length as htmlBase, so offsets
  // found in the mask can be used on htmlBase.
  debug writeln("Step 2: Read quotes");
  string htmlMask = htmlBase.replace(`\"`, "ss"); // Replace "false" quotes with chars
  auto stringIndicators = htmlMask.posOfAll(`"`); // Now we have all quotes
  // `i + 1 < length` avoids the unsigned underflow of `length - 1` for texts
  // without quotes and skips an unpaired last quote.
  for (size_t i = 0; i + 1 < stringIndicators.length; i += 2) {
    htmlMask = htmlMask.fillWith("s", stringIndicators[i] + 1, stringIndicators[i + 1]);
  }

  // Step 3: Looking for tag limiter every < has a > if well-formed
  debug writeln("Step 3: Looking for tag limiter");
  auto ltIndicators = htmlMask.posOfAll(`<`); // All <
  auto gtIndicators = htmlMask.posOfAll(`>`); // All >

  // Step 4: Set Nodes and level
  DH5Node rootNode = H5Node;
  size_t level;
  foreach (index, ltPos; ltIndicators) {
    if (index >= gtIndicators.length)
    { // Not well-formed :-(
      break;
    }
    immutable gtPos = gtIndicators[index];
    if (gtPos < ltPos)
    { // ">" in front of its "<": not well-formed either
      break;
    }

    DH5Node node = H5Node(rootNode);
    node.isRoot = false;

    string tag = htmlMask[ltPos .. gtPos + 1]; // Should be <tag> or </tag> or <tag/>
    auto tagSpaces = posOfAll(htmlMask, " ", ltPos, gtPos - 1);
    debug writeln("tag;", tag);
    debug writeln(tagSpaces);

    auto spacesSeparated = tag.split(" "); // separate in spaces if exists
    string tagName;
    node.level = level;
    if (tag.indexOf("</") == 0)
    { // is EndTag
      node.isEndTag = true;
      tagName = spacesSeparated[0].replace("</", "").replace(">", "").strip;
      if (rootNode.rootNode !is null) // stay on the current node when there is no parent
        rootNode = rootNode.rootNode;
      if (level > 0)
        level--;
    }
    else {
      node.isStartTag = true;
      immutable selfClosing = (tag.length >= 2) && (tag[$ - 2] == '/'); // <tag/>
      tagName = spacesSeparated[0].replace("<", "").replace(">", "").replace("/", "").strip;

      bool sameLevel = selfClosing;
      switch (tagName) {
        case "link", "meta", "img", "input", "br":
          sameLevel = true;
          break;
        default:
          break;
      }
      rootNode.nodes ~= node;
      if (!sameLevel) {
        rootNode = node;
        level++;
      }

      if (tagSpaces.length > 0) {
        debug writeln("T:", htmlBase[ltPos + 1 .. tagSpaces[0]]);

        // Every blank outside of quotes starts a parameter that ends at the
        // next such blank or at the closing ">".
        string[] parameters;
        foreach (spaceI, spacePos; tagSpaces) {
          immutable isLast = (spaceI + 1 == tagSpaces.length);
          immutable stop = isLast ? gtPos : tagSpaces[spaceI + 1];
          auto para = htmlBase[spacePos .. stop].strip;
          if (isLast && selfClosing && para.length > 0 && para[$ - 1] == '/')
            para = para[0 .. $ - 1].strip; // the "/" of "/>" is not part of the attribute
          if (para.length > 0)
            parameters ~= para;
        }
        debug writeln(parameters);

        string[string] attributes;
        foreach (para; parameters) {
          immutable pos = para.indexOf("=");
          if (pos < 0) { // attribute without value, e.g. "disabled"
            attributes[para] = para;
            continue;
          }
          auto value = para[cast(size_t) pos + 1 .. $];
          immutable quoted = (value.length >= 2)
            && (value[0] == '"' || value[0] == '\'')
            && (value[$ - 1] == value[0]);
          attributes[para[0 .. cast(size_t) pos]] = quoted ? value[1 .. $ - 1] : value;
        }
        debug writeln(attributes);

        // The setter moves "id" and "class" into the dedicated properties.
        node.attributes = attributes;
      }
      node.level = level;
    }
    node.tag = tagName;

    // Text between this tag and the next one becomes a content node.
    if (index + 1 < ltIndicators.length)
    {
      immutable contentStart = gtPos + 1;
      immutable contentEnd = ltIndicators[index + 1];
      if (contentStart < contentEnd) {
        DH5Node sNode = H5Node(rootNode);
        sNode.isRoot = false;
        sNode.isContent = true;
        sNode.txt = htmlBase[contentStart .. contentEnd];
        sNode.level = level;

        rootNode.nodes ~= sNode;
      }
    }
  }

  return rootNode;
}
504 
// Conversion smoke test: converts the sample file "html.txt" to "h5.txt".
// It is skipped when the sample is missing in the working directory, so a
// unittest build does not fail on machines without that file.
unittest
{
  import std.file : exists, readText;
  import std.stdio : File;

  /*   // writeln(`<tag a d>`);
  // writeln(parse2(`<tag a d>`));
  // writeln(`<tag a="b" d="c">`);
  // writeln(parse2(`<tag a="b" d="c">`));
  // writeln(`<tag>`);
  // writeln(parse2(`<tag>`));
 */
  if (exists("html.txt")) {
    auto nodes = parse2(readText("html.txt"));
    // writeln(nodeToH5(nodes.nodes,0));

    auto f = File("h5.txt", "w"); // open for writing
    f.write(nodeToH5(nodes.nodes,0));
  }
}
520 
/// Walks the tree below `nodes` depth-first. The output statement is
/// disabled, so the traversal currently produces no output.
void writelnNodes(DH5Node[] nodes) {
  for (size_t idx = 0; idx < nodes.length; idx++) {
    auto children = nodes[idx].nodes;
    // writeln(mul("    ", node.level), "\t", node.level, ":", node.tagToH5, "(", "H5String(\"" ~ node.txt.strip ~ "\")");
    if (children)
      writelnNodes(children);
  }
}
528 
/// Returns `txt` with `multiple * step` blanks in front of it.
string intender(string txt, size_t multiple, size_t step = 2) {
  auto padding = new char[multiple * step];
  padding[] = ' ';
  return padding.idup ~ txt;
}
536 
/**
 * Renders a list of nodes as comma separated source text of H5 calls.
 *
 * A list that consists of one content node is rendered as a quoted string.
 * In all other cases every node is rendered on its own line, indented by
 * `level`; nodes without output are left out.
 */
string nodeToH5(DH5Node[] nodes, size_t level) {
  string[] pieces;

  immutable loneContent = (nodes.length == 1) && nodes[0].isContent;
  if (loneContent) {
    auto text = nodes[0].txt;
    pieces ~= (text.length > 0) ? "\"%s\"".format(text) : "";
  }
  else {
    foreach (child; nodes) {
      auto rendered = nodeToH5(child, level);
      if (rendered.length == 0)
        continue;
      pieces ~= intender("\n" ~ rendered, level);
    }
  }

  // Tidy up separators that were left in front of closing brackets and
  // behind empty strings.
  auto joined = pieces.join(",");
  return joined.replace(",)", ")").replace(",]", "]").replace("\"\",", ",");
}
559 
/**
 * Renders one node as source text of an H5 call, indented by `level`.
 *
 * Start tags become `H5<Tag>(...)` with id, classes, attributes and children
 * (or the node text) as arguments. Content nodes with text become
 * `H5String("...")`. Every other node yields an empty string.
 */
string nodeToH5(DH5Node node, size_t level) {
  if (node.isStartTag) {
    string[] vals;

    if (node.id.strip.length > 0) vals ~= node.idToH5;
    if (node.classes.length > 0) vals ~= node.classesToH5;

    string[] ats;
    foreach (k, v; node.attributes) {
      ats ~= "\"%s\":\"%s\"".format(k, v.strip);
    }
    if (ats.length > 0) vals ~= "[" ~ ats.join(",") ~ "]";

    if (node.nodes) vals ~= nodeToH5(node.nodes, level + 1);
    else if (node.txt.length > 0) vals ~= "\"" ~ node.txt ~ "\"";

    auto arguments = "(%s\n".format(vals.join(","));
    auto call = "H5" ~ capitalize(node.tag) ~ arguments ~ ")";
    return intender(call, level);
  }

  if (node.isContent && (node.txt.length > 0)) {
    return intender("H5String(\"%s\")".format(node.txt), level);
  }

  return null;
}