-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathTopFileGenerator.php
More file actions
137 lines (114 loc) · 4.69 KB
/
TopFileGenerator.php
File metadata and controls
137 lines (114 loc) · 4.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
<?php
include_once 'MyHeap.php'; // Heap class
/**
 * Generates a 'Top' file from a sorted log file.
 *
 * Each log line holds '|'-separated fields. For every distinct criterion
 * (field $criterionIndex), the consecutive occurrences of each value
 * (field $valueIndex) are counted, and the $topNumber most frequent values
 * are written as one line, most frequent first:
 *
 *     criterion|value1:n1,value2:n2,...,valueN:nN
 *
 * Lines are separated by "\n"; the last line has no trailing newline.
 * If $topNumber is less than 1, no value is retained and each line is
 * just "criterion|".
 *
 * The log must be sorted so that equal criteria, and equal values within a
 * criterion, sit on adjacent lines: counting relies on detecting a change
 * from one line to the next. IDs are compared as exact strings.
 *
 * Reading stops at the first line that does not have exactly $nbLogParam
 * fields (this is how the end of the data is detected). The script is
 * terminated (die/exit) when a file cannot be opened or when the log does
 * not start with a valid line.
 *
 * Progress messages are echoed using the global colour strings
 * $red, $blue, $noColor and $OK.
 *
 * @param string $filename       name of the sorted log file
 * @param string $topFilename    name of the top file that will be generated
 * @param int    $topNumber      number of the Top such as 'Top ($topNumber)' is generated
 * @param int    $nbLogParam     number of log parameters per line
 * @param int    $criterionIndex the index of the criterion for the top (0: song_id, 1: usr_id, 2: country_code)
 * @param int    $valueIndex     the index of the value that will be counted (0: song_id, 1: usr_id, 2: country_code)
 */
function generateTopFile($filename, $topFilename, $topNumber, $nbLogParam, $criterionIndex, $valueIndex) {
    global $red, $blue, $noColor, $OK;

    echo $blue."*** Generating ".$topFilename." ***".$noColor."\n";

    // The sorted log file is opened
    $fh = fopen($filename, 'r') or die($red."Oops, couldn't open ".$filename."!".$noColor."\n\n");
    // The top file is created
    $fp = fopen($topFilename, 'w') or die($red."Oops, couldn't create a new file!".$noColor."\n\n");

    // Heap entries are array($valueID, $counter).
    // NOTE(review): MyHeap is expected to behave as a min-heap ordered by counter,
    // then by value ID on ties, so extract() drops the weakest entry - confirm in MyHeap.php.
    $VALUE_INDEX = 0;
    $COUNTER_INDEX = 1;
    $heap = new MyHeap($COUNTER_INDEX, $VALUE_INDEX);

    // Adds a finished (value, count) pair and keeps at most $topNumber entries in the heap.
    $pushCount = function ($valueID, $count) use ($heap, $topNumber) {
        $heap->insert(array($valueID, $count));
        if ($heap->count() > $topNumber) {
            $heap->extract();
        }
    };

    // Empties the heap and writes "criterion|v1:n1,v2:n2,..." (no line terminator).
    $writeTop = function ($criterionID) use ($heap, $fp, $VALUE_INDEX, $COUNTER_INDEX) {
        // Entries come out of the heap weakest first; reverse them for descending order
        $ascending = array();
        while (!$heap->isEmpty()) {
            $ascending[] = $heap->extract();
        }
        $entries = array();
        foreach (array_reverse($ascending) as $data) {
            $entries[] = $data[$VALUE_INDEX].":".$data[$COUNTER_INDEX];
        }
        fwrite($fp, $criterionID."|".implode(",", $entries));
    };

    echo "Reading and counting data... ";

    // The first line is read. feof() is false on a freshly opened handle even when
    // the file is empty, so the result of fgets() itself is what must be checked.
    $line = fgets($fh);
    $row = ($line === false) ? array() : explode('|', trim($line));
    if (count($row) != $nbLogParam) {
        // File shouldn't be empty at this point; if it is, something wrong must have happened meanwhile
        fclose($fh);
        fclose($fp);
        exit($red."Unexpected EOF reached... Script aborted!".$noColor."\n");
    }

    $currentCriterionID = $row[$criterionIndex]; // ID of the criterion that is currently analyzed
    $currentValueID = $row[$valueIndex];         // ID of the value that is currently counted
    $counter = 1;                                // Occurrences of $currentValueID seen so far

    while (($line = fgets($fh)) !== false) {
        $row = explode('|', trim($line));
        if (count($row) != $nbLogParam) {
            // Data is expected to have ($nbLogParam) values;
            // if not, it means that the end of this file has been reached
            break;
        }

        // Strict comparison: both sides are strings, and a loose one would merge
        // distinct numeric-looking IDs such as "007"/"7" or "1e1"/"10".
        if ($currentCriterionID !== $row[$criterionIndex]) {
            // Next criterion detected: finish the current one and write its Top
            $pushCount($currentValueID, $counter);
            $writeTop($currentCriterionID);
            fwrite($fp, "\n");
            $currentCriterionID = $row[$criterionIndex];
            $currentValueID = $row[$valueIndex];
            $counter = 1;
        } elseif ($currentValueID !== $row[$valueIndex]) {
            // Next value detected within the same criterion
            $pushCount($currentValueID, $counter);
            $currentValueID = $row[$valueIndex];
            $counter = 1;
        } else {
            // Else, keep counting
            $counter++;
        }
    }

    // Repeat the process for the last Top (written without a trailing newline)
    $pushCount($currentValueID, $counter);
    $writeTop($currentCriterionID);

    echo $OK.$blue."*** DONE! ***".$noColor."\n";

    unset($heap);
    fclose($fh);
    fclose($fp);
}
?>