-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathharvester.js
More file actions
791 lines (780 loc) · 31.4 KB
/
harvester.js
File metadata and controls
791 lines (780 loc) · 31.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
/**
* This coefficient affects level calculation. Under level I mean every recursive call of match()
* function. For every deeper or upper jump we multiply current level into this coefficient. Like
* this: Math.round(level * TREE_COMPLETE_COEF) || 1. Looks like I refound Fibonacci coefficient :)
* The level affects score for every found node by this formula:
* (node.score > 0 || score > 0) && (node.score += (score + (node.score > 0 ? maxLevel - level : 0)))
* Also, level affects how many deeper or upper nodes will be using during search. This is also
* optimization metric.
*/
const LEVEL_COEF = 1.61803398875
/**
* Number of spaces representing one indentation level.
*/
const SPACE_AMOUNT = 2
/**
* Minimum depth we are doing to search. This value is used to be more robust to the changed DOM
* (added and removed nodes) structure
*/
const MIN_DEPTH = 3
/**
* Amount of point we add if the tag is equal to which we are loking for
*/
const TAG_SCORE = 1
/**
* Amount of milliseconds harvest() function may searching for the data in a DOM. When this timeout
* is reached execution will be stopped and only found data will be returned in a map
*/
const EXECUTION_TIME = 15000
/**
* Regular expression to parse a single line of the pseudo tree-like string. Matches:
* indentation, tag name, optional text, optional text type, optional text value and
* optional attribute. Full string may look like: " div{price:float}[id=id]".
*/
const LINE_RE =
/^( *)?([a-zA-Z0-9_-]+|\*)(?:\{([a-zA-Z0-9_]+)(?::([a-z]+)(?::(.*))?)?\})?(?:\[([a-zA-Z0-9_-]+)=([a-zA-Z0-9_-]+?)\])? *$/
/**
* Special constant for the jsdom emulation. The same like Node.TEXT_NODE under browser
*/
const TEXT_NODE = 3
/**
* Two dimentional cache for score of the nodes. Contains DOM node as a first key and pseudo
* tree-like node unique id as a second key, which contains a score. It's used to optimize
* recursion process. If score of cached sub-tree is lower than maxScore there is no sense to
* traverse into this sub-node and we may skip it. Ex: const score = SCORE_CACHE.get(el).get(id).
*/
const SCORE_CACHE = new Map()
/**
* Cache for tag names. Uses DOM element as a key and tagName property as a value.
*/
const TAG_NAME_CACHE = new Map()
/**
* Cache for tags text. Uses DOM element as a key and tag text as a value.
*/
const TEXT_CACHE = new Map()
/**
* Cahe for parentNode elements. Uses DOM elements as a key and parent elements as a value.
*/
const PARENT_CACHE = new Map()
/**
* Cache for el.firstElementChild elements. Uses element as a key and firstElementChild as a value.
*/
const FIRST_CHILD_CACHE = new Map()
/**
* Cache for el.nextElementSibling elements. Uses element as a key and nextElementSibling as a value.
*/
const NEXT_CACHE = new Map()
/**
* Global identifier for pseudo tree-like nodes. Every node obtains id++ as unique identifier.
* This variable should be reset before every usege (before toTree() call).
*/
let id = -1
/**
* Reference to the root DOM element we start searching from. Shoould be set once before matching
* process is started
*/
let rootEl
/**
* Reference to the firstEl, which is passed as a second parameter to the harvester() function. We
* need it to check if algorithm is not jumping outside the first element we started search from
*/
let rootFirstEl
/**
* Harvester options.
* spaceAmount - see SPACE_AMOUNT
* executionTime - see EXECUTION_TIME
* minDepth - see MIN_DEPTH
* tagScore - see TAG_SCORE
* tagTextScore - === tagScore * minDepth * 2
* tagAttrScore - === tagScore * minDepth * 2
* tagTextTypeScore - === tagScore * minDepth * 4
* dataOnly - return only data map, without maxScore, score and metaTree
*/
let options = {}
/**
* Timestamp, when harvest() function was started to work. Is used with executionTime option
* to calculate if we have to stop searching, becaue of executionTime timeout is reached
*/
let startTime = performance.now()
/**
* Returns a cached score for DOM elements and pseudo tree-like nodes array. So if we trying to
* calculate a score for the DOM nodes and all it's sub-nodes we have to check this cache first.
* @param {Element} firstEl First DOM element we start to compare
* @param {Number} nodesId Unique id of pseudo tree-like nodes
* @returns {Number|undefined} score or undefined
*/
SCORE_CACHE.getScore = function getScore (firstEl, nodesId) {
if (this.get(firstEl) === undefined) this.set(firstEl, new Map())
return this.get(firstEl).get(nodesId)
}
/**
* Sets score value into the cache. Score is related to DOM element and template node(s) id
* @param {Element} firstEl First DOM element we start to compare
* @param {String} nodesId Unique pseudo tree-like nodes id
* @param {Number} score Score
*/
SCORE_CACHE.setScore = function setScore (firstEl, nodesId, score) {
this.get(firstEl).set(nodesId, score)
}
/**
* Returns tag name from a cache by DOM element
*/
TAG_NAME_CACHE.getName = function getName (el) {
let tagName = TAG_NAME_CACHE.get(el)
if (tagName === undefined) TAG_NAME_CACHE.set(el, tagName = el.tagName.toUpperCase())
return tagName
}
/**
* Returns cached text of the DOM element
* @param {Element} el DOM element
* @returns {String|undefined}
*/
TEXT_CACHE.getText = function getText (el) {
let t = this.get(el)
if (t === undefined) this.set(el, t = text(el))
return t
}
/**
* Returns cached parent node for DOM element "el"
* @param {Element} el DOM element whose parent we need to get
* @returns {Element|null}
*/
PARENT_CACHE.getParent = function getParent (el) {
let parentEl = this.get(el)
if (parentEl === undefined) this.set(el, parentEl = el?.parentNode)
return parentEl
}
/**
* Returns cached first child of DOM element "el"
* @param {Element} el Element whose first child we need to get
* @returns {Element|null}
*/
FIRST_CHILD_CACHE.getFirstChild = function getFirstChild (el) {
let firstChildEl = this.get(el)
if (firstChildEl === undefined) this.set(el, firstChildEl = el.firstElementChild)
return firstChildEl
}
/**
* Returns cached next sibling DOM element for "el"
* @param {Element} el DOM Element, whose next sibling we need to get
* @returns {Element|null}
*/
NEXT_CACHE.getNext = function getNext (el) {
let nextEl = this.get(el)
if (nextEl === undefined) this.set(el, nextEl = el?.nextElementSibling)
return nextEl
}
/**
* Creates all properties of harvester options object. Sets default values if not set
* @param {Object} opt Options obtained from harvest() function as a parameter
* @returns {Object} Full options object
*/
function buildOptions (opt = {}) {
!opt.completeCoef && (opt.completeCoef = LEVEL_COEF)
!opt.spaceAmount && (opt.spaceAmount = SPACE_AMOUNT)
!opt.executionTime && (opt.executionTime = EXECUTION_TIME)
!opt.minDepth && (opt.minDepth = MIN_DEPTH)
!opt.tagScore && (opt.tagScore = TAG_SCORE)
!opt.tagTextScore && (opt.tagTextScore = opt.tagScore * opt.minDepth * 2)
!opt.tagAttrScore && (opt.tagAttrScore = opt.tagScore * opt.minDepth * 2)
!opt.tagTextTypeScore && (opt.tagTextTypeScore = opt.tagScore * opt.minDepth * 4)
!opt.dataOnly && (opt.dataOnly = false)
options = opt
}
/**
* Displays an error message during parsing of the pseudo tree-like string.
* @param {String} line The current line being parsed.
* @param {Number} l Line number.
* @param {String} msg Error message.
*/
function err (line, l, msg) {
console.error(`Error in line '${line}' #:${l}. ${msg}.`)
}
/**
* Recursively converts a pseudo tree-like string into an array of nodes.
* Invalid nodes are skipped and only valid lines will be in final JSON tree.
* Full format of one line is: " tag{textTag:textType:textVal}[attrTag=attrName]".
* Example: " img{text:func:checkText}[attr=href]"
*
* @param {String[]} lines The pseudo tree-like string split into lines.
* @param {Number} l Current line index.
* @param {Object[]} nodes Array to store parsed nodes.
* @param {Number} level Current indentation level.
* @param {Number} startSpaces Left padding of the first not empty tag in a tree
* @returns {[Number, Number]} The updated line index and level difference.
*/
function parse (lines, l, nodes, level, startSpaces = -1) {
for (let i = l; i < lines.length; i++) {
const line = lines[i]
if (!line || line.trim() === '') continue
// 1: spaces, 2: tag, 3: textTag, 4: text type, 5: text value, 6: attrTag, 7: attrName
const m = line.match(LINE_RE)
if (!m) { err(line, i, 'Wrong line format'); continue }
const spaces = m[1]?.length || 0
if (spaces % options.spaceAmount !== 0) {
err(line, i, `Wrong left indentation. Must be a multiple of ${options.spaceAmount}`)
continue
}
if (startSpaces < 0) startSpaces = spaces
if ((spaces - startSpaces) % options.spaceAmount !== 0) {
err(line, i, `Wrong left indentation. Must be a multiple of ${options.spaceAmount}`)
continue
}
const curLevel = (spaces - startSpaces) / options.spaceAmount
if (curLevel < 0) { err(line, i, 'Wrong left indentation level'); continue }
if (curLevel > level && curLevel - level > 1) {
err(line, i, 'Wrong left indentation level')
continue
}
if (m[6] && !m[7]) {
err(line, i, 'Wrong attribute format. Should be [attrTag=attrName]')
continue
}
if (curLevel === level) {
const node = { id: id++, tag: m[2].toUpperCase(), maxScore: 0 }
m[3] && (node.textTag = m[3])
m[4] && (node.textType = m[4])
m[5] && (node.textVal = m[5])
m[6] && (node.attrTag = [m[6], m[7]])
nodes.push(node)
} else if (curLevel > level) {
if (!nodes.length) { err(line, i, 'Wrong left indentation level'); continue }
nodes[nodes.length - 1].children = []
let ret
[i, ret] = parse(lines, i, nodes[nodes.length - 1].children, level + 1, startSpaces)
if (ret) return [i, ret - 1]
} else return [i - 1, level - curLevel - 1]
}
return [lines.length, 0]
}
/**
* Converts a pseudo tree-like string into a JSON tree. Only valid nodes will be parsed.
* @param {String} tpl The pseudo tree-like string in format:
*
* div
* span
* *{h1}
* img{text}[src=src]
*
* @returns {Object[]} Parsed tree structure.
*/
function toTree (tpl) {
const lines = tpl.toString().split('\n')
const nodes = []
id = 0
parse(lines, 0, nodes, 0)
return nodes
}
/**
* Checks if a val is a string
* @param {*} val
* @returns {Boolean}
*/
function isStr (val) {
return typeof val === 'string' || val instanceof String
}
/**
* Checks if a value is a non-null object.
* @param {*} val Value to check.
* @returns {Boolean} True if val is an object, false otherwise.
*/
function isObj (val) {
return typeof val === 'object' && !Array.isArray(val) && val !== null
}
/**
* Checks if a string is a float number after type cast
* @param {String} str String to check
* @returns {Boolean}
*/
function isFloat (str) {
const num = +str
return str !== '' && !Number.isNaN(num) && !Number.isInteger(num)
}
/**
* Checks if a string is an integer number after type cast
* @param {String} str String to check
* @returns {Boolean}
*/
function isInt (str) {
return str !== '' && Number.isInteger(+str) && !str.includes('.')
}
/**
* Retrieves the first direct trimmed text content of an element, skipping nested elements.
* @param {Element} el The DOM element.
* @returns {String|undefined} The text content or undefined if none found.
*/
function text (el) {
if (!el) return undefined
const texts = []
for (const child of el.childNodes) {
if (child.nodeType === TEXT_NODE) {
const t = child.textContent.trim()
t && texts.push(t)
}
}
return texts.length === 1 ? texts[0] : (!texts.length ? '' : texts)
}
/**
* Returns unique id for the array of nodes combining their ids. It takes only the nodes of one
* level without children.
* @param {Node[]} nodes Array of nodes
* @returns {String} Unique id string
*/
function getNodesId (nodes) {
return nodes.reduce((pre, cur) => pre + ((pre && '-') + cur.id), '')
}
/**
* Returns all possible variants of nodes of the one level. Is used to compare all possible
* nodes variants of pseudo tree and DOM nodes of one level. It works in a revert way to obtain
* long subsets first and short at the end.
* @param {Node[]} nodes Array of nodes we have create variants for
* @returns {Nodes[][]} Array of arrays of Nodes combinations
*
* @example
* const nodes = [{id: 0, tag: '*'}, {id: 1, tag: 'span'}]
* // returns [
* // [{id: 0, tag: '*'}, {id: 1, tag: 'span'}],
* // [{id: 1, tag: 'span'}],
* // [{id: 0, tag: '*'}]
* // ]
* const vars = subsets(nodes)
*/
function subsets (nodes) {
if (!nodes) return []
const len = nodes.length
const size = 1 << len
const result = new Array(size - 1)
for (let i = size - 1, idx = 0; i > 0; i--) {
const subset = []
for (let j = 0; j < len; j++) (i & (1 << j)) && subset.push(nodes[j])
result[idx++] = subset
}
return result
}
/**
* Makes a deep copy of the object or an array. This is your responsibility to pass obj
* parameter without circular nodes. We know exactly the type of object we pass, so we do
* optimizations (decrease recursion calls) if it's an object.
* @param {Object|Array} obj Object or array to copy
* @returns {Object|Array} Copy of object or array
*/
function copy (obj) {
if (Array.isArray(obj)) {
const len = obj.length
const cpy = new Array(len)
for (let i = 0; i < len; i++) cpy[i] = copy(obj[i])
return cpy
} else if (typeof obj === 'object') {
/**
* We know that only objects will be copied, so we do it without additional recursion steps
* for every object property with simple type like string, number, undefined, ...
*/
const cpy = { id: obj.id, tag: obj.tag, el: obj.el, score: obj.score, maxScore: obj.maxScore }
// obj.text related to textTag, so we copy them together and if textTag exists
if (obj.textTag) {
cpy.textTag = obj.textTag
cpy.text = obj.text
obj.textType && (cpy.textType = obj.textType)
obj.textVal && (cpy.textVal = obj.textVal)
}
// the same for attrTag and attr props
if (obj.attrTag) { cpy.attrTag = [obj.attrTag[0], obj.attrTag[1]]; cpy.attr = obj.attr }
obj.children && (cpy.children = copy(obj.children))
return cpy
}
return obj
}
/**
* Recursively traverses an object or array, applying a callback to each node. The speed here is
* not so important, so we call itself more times.
* @param {Object|Array} obj Object or array to traverse.
* @param {Function} cb Callback function (cb(node, key)) for every node.
* @param {Function} endCb Callback, which is called at the end of walking on an object
* @param {Number} level Recursion level
*/
function walk (obj, cb, endCb, level = 0) {
if (Array.isArray(obj)) {
for (let i = 0; i < obj.length; i++) { cb(obj[i], i, level); walk(obj[i], cb, endCb, level) }
} else if (typeof obj === 'object') {
for (const p in obj) {
if (p !== 'el') {
const oldLevel = level
level = cb(obj[p], p, level)
walk(obj[p], cb, endCb, level)
level = oldLevel
}
}
endCb?.(obj, level)
} else cb(obj)
}
/**
* Checks if a tag from pseudo tree-like node is similar to DOM element tag name. If a tag in a
* pseudo tree is equal to *, then any tag is equal. It uses cache for the tag name.
* @param {Object} node Pseudo tree like template's node
* @param {Element} el DOM element
* @returns {Boolean}
*/
function sameTag (node, el) {
if (node.tag === '*') return true
return TAG_NAME_CACHE.getName(el) === node.tag
}
/**
* Checks if the tag's text has a type "type" and value "val". Is used for checking tag's text
* for the specified type. For example: str, int, float, func,... If user sets "func" type it
* means this function should be defined in a global context (window under browser and self
* under Node.js).
* @param {Element} el Current DOM element we are sezrching on
* @param {String} text Text we are checking
* @param {String} type str, int, float,...
* @param {String} val Additional parameter for type. For example for the func type we provide
* custom function name to call
* @returns {Boolean}
*/
function sameType (el, text, type, val) {
switch (type) {
/* eslint-disable no-multi-spaces */
case 'int' : return isInt(text)
case 'float' : return isFloat(text)
case 'with' : return text.includes(val)
case 'func' : {
const obj = typeof global === 'undefined' ? self : global // eslint-disable-line no-undef
if (!obj) {
console.warn(`Unknown environment. Impossible to find global or self objects to run ${val} function`)
return false
}
const fn = obj[val]
if (!fn) {
console.warn(`Function ${val} is not found in a global context`)
return false
}
return !!fn(text, el)
}
case 'parent': return TAG_NAME_CACHE.getName(PARENT_CACHE.getParent(el)) === val.toUpperCase()
case 'str' : return true
case 'empty' : return text.trim() === ''
}
/* eslint-enable no-multi-spaces */
return false
}
/**
* This function calculates total total maxScore and maxScore for every node. maxScore is used
* during matching to skip nodes with lower score for optimization. Global maxScore - it's a global
* score of the root node(s).
* @param {Node[]} tplNodes Pseudo template JSON tree
* @returns {[Number, Number]} [depth, maxScore] maximum depth in a DOM tree we will search &
* maxScore of the root node(s)
*/
function initTpl (tplNodes) {
let depth = options.minDepth
walk(tplNodes, (d, prop, level) => {
if (!isObj(d)) return prop !== 'children' ? level : Math.round(level * options.completeCoef) || 1
if (d?.tag) { d.maxScore += options.tagScore; depth++ }
d.textTag && !d.textType && (d.maxScore += options.tagTextScore)
d.textType && (d.maxScore += options.tagTextTypeScore)
d.attrTag && (d.maxScore += options.tagAttrScore)
})
walk(tplNodes, (d, prop, level) => {
if (!isObj(d)) return prop !== 'children' ? level : Math.round(level * options.completeCoef) || 1
}, (o, level) => {
if (o.maxScore) {
o?.children && (o.maxScore = o.children.reduce((pre, cur) => pre + cur.maxScore, o.maxScore || 0))
o.maxScore += (depth - level)
}
})
/**
* To calculate maxScore we have to sum all first level nodes
*/
const maxScore = tplNodes.reduce((pre, cur) => pre + cur.maxScore, 0)
return [depth, maxScore]
}
/**
* Initializes match() function and it's data. Should be called before first match() call
* @param {Element|String} firstEl DOM element we are starting search from or css query
* @return {Element}
*/
function initMatch (firstEl) {
firstEl = isStr(firstEl) ? document.querySelector(firstEl) : firstEl
rootFirstEl = firstEl
rootEl = firstEl.parentNode
SCORE_CACHE.clear()
TAG_NAME_CACHE.clear()
PARENT_CACHE.clear()
FIRST_CHILD_CACHE.clear()
NEXT_CACHE.clear()
TEXT_CACHE.clear()
PARENT_CACHE.set(firstEl, rootEl)
startTime = performance.now()
return firstEl
}
/**
* Finds all nodes in a DOM tree according to JSON tree. The starting format of one JSON node
* is following: {id: Number, el: Element, tag: String, textTag: String, children: [], maxScore:
* Number, attrTag: Array, text: String, attr: String}, where:
*
* id - unique identifier of the pseudo node
* el - reference to DOM element
* tag - name of the HTML tag we are looking for or *
* score - found score of the current node (+1 if tag exist or *, +1 if textTag is not empty and
* HTML element also contains a text in it)
* text - text grabbed from HTML tag
* textTag - name of the property we will put data in
* textType - type of the tag's text we are looking for
* textVal - additional value. Depends on textType
* children - an array of the same nodes to support recursion search
* attrTag - an array of two elements with name of the attribute tag and name of the attribute of
* the DOM element.
* maxScore - maxScore, calculated before search
*
* After all nodes will be found "score" and "text" properties will be added into the result nodes.
* The minimum node contains only "tag" property. Also, there are optional properties: textType and
* textVal. They are used if we need to specify concrete type of the data we are looking for. For
* example it may be a float number or integer or even an empty string. This function uses fuzzy
* trees comparison and don't violate sequence of nodes on the same level. So if we are looking for
* span, div, h1 on one level their order is important. It's possible to have tags between them,
* but div is always stays after span and h1 is always after div and span.
*
* @param {String} tplNodesId Unique id of an array of nodes in tplNodes parameter
* @param {Node[]} tplNodes Array of tpl nodes we have to match
* @param {Element} firstEl First element in a DOM associated with tplNodes[0]
* @param {Number} level Current up or down level we are searching on
* @param {Number} maxLevel Max level we may go to through search
* @param {Element} extraParentEl If firstEl === null, only first recursion (going upper) has sense
* so we put the parent into separate parameter
* @returns {[score, Nodes[]|undefined]}
*/
function match (tplNodesId, tplNodes, firstEl, level, maxLevel, extraParentEl = null) {
if (!tplNodes?.length || performance.now() - startTime > options.executionTime) {
return [0, undefined]
}
let maxScore = 0
let maxNodes
if (level <= maxLevel) {
/**
* First, we check current nodes array in one level upper without combinations. This variant
* is used when DOM tree has a lack of nodes between current nodes in pseudo tree and DOM
* tree. For example (pseudo tree on the left and DOM tree on the right):
*
* h1
* h1
* h1 -> div
* h1
*
* In this example, we have to find two h1 tags of a pseudo tree in a DOM tree. Please pay
* attention on the fact that two h1 tags in a DOM tree are on the root level and we have
* to skip div tag in a DOM tree during search. Max possible score here is 2 and algorithm
* returns 2 (2 h1 tags found on an upper level).
*/
const upEl = PARENT_CACHE.getParent(firstEl) || extraParentEl
if (upEl !== rootEl) {
const parentEl = PARENT_CACHE.getParent(upEl)
const firstChildEl = parentEl === rootEl ? upEl : FIRST_CHILD_CACHE.getFirstChild(parentEl)
/**
* Optimization logic: we have to skip nodes with lower score, because other node is
* better than current.
*/
const score = SCORE_CACHE.getScore(firstChildEl, tplNodesId)
if (score === undefined || score > maxScore) {
const newLevel = Math.round(level * options.completeCoef) || 1
const [upScore, upNodes] = match(tplNodesId, tplNodes, firstChildEl, newLevel, maxLevel)
if (upScore > maxScore && upNodes) {
maxScore = upScore
if (score === undefined && tplNodesId !== undefined && maxLevel - newLevel > options.minDepth) {
SCORE_CACHE.setScore(firstChildEl, tplNodesId, maxScore)
}
maxNodes = upNodes
}
}
}
/**
* Second, we check similar nodes in a one level deeper without combinations. This variant
* is used when DOM tree has extra nodes between current nodes in pseudo tree. For example
* (pseudo tree on the left and DOM tree on the right):
*
* h1
* h1 -> div
* h1
* h1
*
* In this example, we have to find two h1 tags inside the div, but div itself should be
* skipped. So max possible score here === 2 and algorithm should return 2 (two h1 tags found
* inside the div tag).
*/
let el = firstEl
while (el) {
const firstChildEl = FIRST_CHILD_CACHE.getFirstChild(el)
if (firstChildEl) {
/**
* Optimization logic: we have to skip nodes with lower score, because other node is
* better than current.
*/
const score = SCORE_CACHE.getScore(firstChildEl, tplNodesId)
if (score === undefined || score > maxScore) {
const newLevel = Math.round(level * options.completeCoef) || 1
const [deepScore, deepNodes] = match(tplNodesId, tplNodes, firstChildEl, newLevel, maxLevel)
if (deepScore > maxScore && deepNodes) {
maxScore = deepScore
if (score === undefined && tplNodesId !== undefined && maxLevel - newLevel > options.minDepth) {
SCORE_CACHE.setScore(firstChildEl, tplNodesId, maxScore)
}
maxNodes = deepNodes
}
}
}
el = el === rootFirstEl ? null : NEXT_CACHE.getNext(el)
}
}
// No children in a current DOM node. No sense to search deeper
if (!firstEl) return [maxScore, maxNodes]
/**
* Third, we check similar nodes on the same level with all possible combinations of pseudo
* tree. For example if we have 2 nodes: [{tag: 'div'}, {tag: 'span'}], we will have 3
* possible combinations with different max scores:
*
* 1. [{tag: 'div'}, {tag: 'span'}] // max score 2
* 2. [{tag: 'span'}] // max score 1
* 3. [{tag: 'div'}] // max score 1
*
* It picks every combination of pseudo nodes and try to find it in a DOM tree. The tree with
* maximized score will be returned as a result.
*/
const combinations = subsets(tplNodes)
for (let c = 0; c < combinations.length; c++) {
/**
* If current maxScore is bigger than score of the next combination, it means searching
* next combination has no sense. We have to skip it to make this matching faster.
*/
const combRef = combinations[c]
const combScore = combRef.reduce((pre, cur) => pre + cur.maxScore, 0)
if (maxScore >= combScore) continue
const comb = copy(combRef)
let i = 0
comb[0].el = firstEl
while (true) {
const node = comb[i]
const el = node.el
if (el) {
node.score = 0
// here we check tag name, tag text and attribute
if (sameTag(node, el)) {
node.score += options.tagScore
if (node.textTag) {
const t = TEXT_CACHE.getText(el)
if (node.textType) {
if (sameType(el, t, node.textType, node.textVal)) {
node.score += options.tagTextTypeScore
node.text = t
} else { node.score -= options.tagTextTypeScore; delete node.text }
} else if (t) { node.text = t; node.score += options.tagTextScore }
}
if (node.attrTag) {
const a = el.getAttribute(node.attrTag[1])
a && (node.attr = a) && (node.score += options.tagAttrScore)
}
}
// Here we go deeper and check inner nodes
const subNodes = node.children
if (subNodes?.length) {
const firstChildEl = FIRST_CHILD_CACHE.getFirstChild(el)
const newLevel = Math.round(level * options.completeCoef) || 1
const extraEl = !firstChildEl ? el : null
const [score, nodes] = match(getNodesId(subNodes), subNodes, firstChildEl, newLevel, maxLevel, extraEl)
// The condition must be > 0
if (node.score > 0 || score > 0) node.score += (score + (node.score > 0 ? maxLevel - level : 0))
nodes?.length && (node.children = nodes)
const nodeId = `${node.id}`
if (node.id !== undefined && SCORE_CACHE.getScore(el, nodeId) === undefined && maxLevel - newLevel > options.minDepth) {
SCORE_CACHE.setScore(el, nodeId, node.score) // score for only 1 node
}
// The condition must be > 0
} else if (node.score > 0) node.score += (maxLevel - level)
// all nodes found let's check if it's a best score
if (i >= comb.length - 1) {
i = comb.length - 1
const nodesScore = comb.reduce((acc, cur) => acc + cur.score || 0, 0)
if (nodesScore > maxScore) { maxScore = nodesScore; maxNodes = copy(comb) }
/**
* Here we reset all children of current combination, because it's structure may
* change. Every time we compare a new set of nodes in a current level we have to
* start with original combination. Otherwise there is a possible issue, where
* previous comparison may affect future compare.
*/
comb[i].children && (comb[i].children = copy(combinations[c][i].children))
} else i++
} else {
/**
* Here we reset all children of current combination, because it's structure may
* change. Every time we compare a new set of nodes in a current level we have to
* start with original combination. Otherwise there is a possible issue, where
* previous comparison may affect future compare.
*/
i && comb.length > 1 && comb[i].children && (comb[i].children = copy(combinations[c][i].children))
i--
if (i < 0) break
}
comb[i].el = comb[i].el === rootFirstEl ? null : NEXT_CACHE.getNext(comb[i]?.el || el)
}
}
return [maxScore, maxNodes]
}
/**
* This function collects final data map, which consists of text and attribute values and checks if
* there are errors in it
* @param {Node[]} nodes Nodes returned by match() function with meta data inside
* @returns {Object} Collected map
*/
function getMap (nodes) {
const map = {}
walk(nodes, d => {
if (!isObj(d)) return
if (d.textTag && d.text !== undefined) {
const tag = d.textTag
map[tag] && console.error(`Two or more equal text tags were found. Text tag: "${tag}"`)
map[tag] = d.text
}
if (d.attrTag && d.attr) {
const tag = d.attrTag[0]
map[tag] && console.error(`Two or more equal attr tags were found. Attr tag: "${tag}"`)
map[tag] = d.attr
}
})
return map
}
/**
* Main function you should call to harvest the data from an DOM tree by template. First,
* create a pseudo tree template like in example below. div, h1, span, img - are tag names
* you are looking for. title, price, img - optional names of properties in a returned map,
* where found text|attribute will be placed. 2 spaces before tags - are nesting level.
* Difference between parent and child nodes may be only 2 spaces if we go from up to down.
* If we go from bottom to up it may be different. This template should pass as a first
* parameter to harvest() function. Second, firstEl - should point to the first element
* in a DOM.
*
* @param {String} tpl template of pseudo tree-like string
* @param {Element|String} element Reference to the first DOM element for nodes[0] or css query
* @param {Object|undefined} opt Harvester options
* @returns {[map: Object, maxScore: Number, foundScore: Number, foundNodes: Array]} map -
* JavaScript object with all text tags and attribute tags in it;maxScore - maximum score.
* It means that found tree is identical to your pseudo tree-like template; foundScore -
* score of found tree may be between [0..maxScore]. Shows similarity between maximum score
* and found; foundNodes - found Array based tree with all metadata in it;
*
* @example
* const tpl = `
* div
* h1{title}
* span{price:float}
* img[img=href]`
* harvest(tpl, $('div')) // [{title: 'Title', price: '12.34', img: 'http://...'}, 8, 7, [...]]
*/
function harvest (tpl, element, opt = {}) {
if (!isObj(opt)) { console.error('"opt" argument is not an object'); return [{}, 0, 0, undefined] }
buildOptions(opt)
const tplNodes = toTree(tpl)
const [depth, maxScore] = initTpl(tplNodes)
if (!element) return [{}, maxScore, 0, []]
const firstEl = initMatch(element)
const [score, nodes] = match(getNodesId(tplNodes), tplNodes, firstEl, 0, depth)
const map = getMap(nodes)
return opt.dataOnly ? map : [map, maxScore, score, nodes]
}
if (typeof module === 'object' && typeof module.exports === 'object') module.exports = { toTree, harvest, buildOptions }