-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpreprocess.py
More file actions
125 lines (103 loc) · 3.21 KB
/
preprocess.py
File metadata and controls
125 lines (103 loc) · 3.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import argparse
import multiprocessing as mp
import shutil
import sys
from argparse import Namespace
from pathlib import Path
import pandas as pd
root = Path(__file__).resolve().parent
sys.path.append(str(root))
from src.config import BugInfo
from src.interfaces.d4j import (
check_out,
get_failed_tests,
get_properties,
run_all_tests,
)
from src.repograph.construct_graph import create_repo_graph
def worker(bug_name, config_file):
proj, bug_id = bug_name.split("_")
run_one(proj, bug_id, config_file)
def check_exists(bug_name):
proj, bug_id = bug_name.split("_")
combined_graph_file = (
root
/ "dataset"
/ "bugInfo"
/ proj
/ f"{proj}@{bug_id}"
/ "combined_graph.pkl"
)
if combined_graph_file.exists():
print(f"Skip {bug_name}")
return True
return False
def main(dataset_file, config_file, num_processes):
df = pd.read_csv(dataset_file, header=None)
bug_names = df.iloc[:, 0].tolist()
# TODO: Control bug for preprocess
bug_names = ["Chart_1"]
if num_processes > 1:
preprocess_failed = []
with mp.Pool(num_processes) as pool:
async_results = []
for bug_name in bug_names:
if check_exists(bug_name):
continue
async_result = pool.apply_async(
worker, (bug_name, config_file)
)
async_results.append(async_result)
for i, async_result in enumerate(async_results):
try:
async_result.get()
except Exception as e:
preprocess_failed.append(f"{bug_names[i]} {str(e)}")
preprocess_failed_file = root / "preprocess_failed.txt"
if preprocess_failed:
with open(preprocess_failed_file, "w") as f:
f.write("\n".join(preprocess_failed))
else:
for bug_name in bug_names:
if check_exists(bug_name):
continue
worker(bug_name, config_file)
# clean up
checkout_tmp_path = (
root / "DebugResult" / BugInfo.get_config_hash(config_file)
)
shutil.rmtree(checkout_tmp_path)
def run_one(proj, bug_id, config_file):
args = Namespace(project=proj, bugID=bug_id, config=config_file)
bug_info = BugInfo(args)
check_out(bug_info)
# collect basic bug information
get_properties(bug_info)
test_failure_obj = get_failed_tests(bug_info)
run_all_tests(bug_info, test_failure_obj)
# create the raw repository graph
create_repo_graph(bug_info, test_failure_obj)
# clean up
shutil.rmtree(bug_info.proj_tmp_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Preprocess all bugs")
parser.add_argument(
"--dataset",
type=str,
help="dataset file",
default="dataset/all_bugs.csv",
)
parser.add_argument(
"--config",
type=str,
help="config file",
default="config/preprocess.yml",
)
parser.add_argument(
"--processes",
type=int,
help="processes",
default=1,
)
args = parser.parse_args()
main(args.dataset, args.config, args.processes)