Skip to content

Commit 0d11236

Browse files
committed
re #BITBUCKET-61 merging in 0.8.5 release
Branch: master
2 parents 2a36245 + cc1fdee commit 0d11236

4 files changed

Lines changed: 206 additions & 36 deletions

File tree

HISTORY.rst

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,21 @@
44
TaskGraph Release History
55
=========================
66

7+
0.8.5 (2019-09-11)
8+
------------------
9+
* Dropped support for Python 2.7.
10+
* Fixed an issue where paths in ``ignore_paths`` were not getting ignored in
11+
the case of ``copy_duplicate_artifact=True``.
12+
* Fixed an issue where the "percent completed" in the logging monitor would
13+
sometimes exceed 100%. This occurred when a duplicate task was added to
14+
the TaskGraph object.
15+
* Fixed an issue where a relative path set as a target path would always cause
16+
TaskGraph to raise an exception after the task was complete.
17+
* Fixed an issue where kwargs that were unhashable were not considered when
18+
determining if a Task should be re-run.
19+
* Fixed an issue where files with almost identical modified times and sizes
20+
could hash equal even when the filenames were different.
21+
722
0.8.4 (2019-05-23)
823
------------------
924
* Fixed an exception that occurred when two tasks were constructed that

bitbucket-pipelines.yml

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,6 @@
11
pipelines:
22
default:
33
- parallel:
4-
- step:
5-
name: Tests on python2.7
6-
image: python:2.7-stretch
7-
caches:
8-
- pip
9-
script:
10-
- pip install tox
11-
- tox -e py27-base,py27-psutil
124
- step:
135
name: Tests on python3.6
146
image: python:3.6-stretch

taskgraph/Task.py

Lines changed: 37 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,10 @@ def __init__(
146146

147147
self._taskgraph_started_event = threading.Event()
148148

149+
# this variable is used to print accurate representation of how many
150+
# tasks have been completed in the logging output.
151+
self._added_task_count = 0
152+
149153
# use this to keep track of all the tasks added to the graph by their
150154
# task hashes. Used to determine if an identical task has been added
151155
# to the taskgraph during `add_task`
@@ -526,6 +530,7 @@ def add_task(
526530
raise ValueError(
527531
"The task graph is closed and cannot accept more "
528532
"tasks.")
533+
self._added_task_count += 1
529534
if args is None:
530535
args = []
531536
if kwargs is None:
@@ -661,20 +666,19 @@ def _execution_monitor(self):
661666
task_name, time.time() - task_time)
662667
for task_name, task_time in self._active_task_list])
663668

664-
total_tasks = len(self._task_hash_map)
665669
completed_tasks = len(self._completed_task_names)
666670
percent_complete = 0.0
667-
if total_tasks > 0:
671+
if self._added_task_count > 0:
668672
percent_complete = 100.0 * (
669-
float(completed_tasks) / total_tasks)
673+
float(completed_tasks) / self._added_task_count)
670674

671675
LOGGER.info(
672676
"\n\ttaskgraph execution status: tasks added: %d \n"
673677
"\ttasks complete: %d (%.1f%%) \n"
674678
"\ttasks waiting for a free worker: %d (qsize: %d)\n"
675-
"\ttasks executing (%d): graph is %s\n%s", total_tasks,
676-
completed_tasks, percent_complete, self._task_waiting_count,
677-
queue_length, active_task_count,
679+
"\ttasks executing (%d): graph is %s\n%s",
680+
self._added_task_count, completed_tasks, percent_complete,
681+
self._task_waiting_count, queue_length, active_task_count,
678682
'closed' if self._closed else 'open',
679683
active_task_message)
680684

@@ -926,13 +930,13 @@ def __init__(
926930
try:
927931
scrubbed_value = _scrub_task_args(arg, self._target_path_list)
928932
_ = pickle.dumps(scrubbed_value)
929-
kwargs_clean[arg] = scrubbed_value
933+
kwargs_clean[key] = scrubbed_value
930934
except TypeError:
931935
LOGGER.warning(
932-
"could not pickle kw argument %s (%s). "
936+
"could not pickle kw argument %s (%s) scrubbed to %s. "
933937
"Skipping argument which means it will not be considered "
934938
"when calculating whether inputs have been changed "
935-
"on a successive run.", key, arg)
939+
"on a successive run.", key, arg, scrubbed_value)
936940

937941
self._reexecution_info = {
938942
'func_name': self._func.__name__,
@@ -1125,7 +1129,8 @@ def is_precalculated(self):
11251129
other_arguments = list(_filter_non_files(
11261130
[self._reexecution_info['args_clean'],
11271131
self._reexecution_info['kwargs_clean']],
1128-
self._target_path_list+self._ignore_path_list,
1132+
self._target_path_list,
1133+
self._ignore_path_list,
11291134
self._ignore_directories))
11301135

11311136
LOGGER.debug("file_stat_list: %s", file_stat_list)
@@ -1174,17 +1179,22 @@ def is_precalculated(self):
11741179
'Path not found: %s' % path)
11751180
continue
11761181
if hash_algorithm == 'sizetimestamp':
1177-
size, modified_time = [
1178-
float(x) for x in hash_string.split(':')]
1182+
size, modified_time, actual_path = [
1183+
x for x in hash_string.split('::')]
1184+
if actual_path != path:
1185+
mismatched_target_file_list.append(
1186+
"Path names don't match\n"
1187+
"cached: (%s)\nactual (%s)" % (path, actual_path))
11791188
target_modified_time = os.path.getmtime(path)
1180-
if not math.isclose(modified_time, target_modified_time):
1189+
if not math.isclose(
1190+
float(modified_time), target_modified_time):
11811191
mismatched_target_file_list.append(
11821192
"Modified times don't match "
11831193
"cached: (%f) actual: (%f)" % (
1184-
modified_time, target_modified_time))
1194+
float(modified_time), target_modified_time))
11851195
continue
11861196
target_size = os.path.getsize(path)
1187-
if size != target_size:
1197+
if float(size) != target_size:
11881198
mismatched_target_file_list.append(
11891199
"File sizes don't match "
11901200
"cached: (%s) actual: (%s)" % (
@@ -1305,7 +1315,7 @@ def _get_file_stats(
13051315

13061316

13071317
def _filter_non_files(
1308-
base_value, keep_list, keep_directories):
1318+
base_value, keep_list, ignore_list, keep_directories):
13091319
"""Remove any values that are files not in ignore list or directories.
13101320
13111321
Parameters:
@@ -1314,6 +1324,7 @@ def _filter_non_files(
13141324
contains filepaths in any nested structure.
13151325
keep_list (list): any paths found in this list are not filtered.
13161326
All paths in this list should be "os.path.norm"ed.
1327+
ignore_list (list): any paths found in this list are filtered.
13171328
keep_directories (boolean): If True directories are not filtered
13181329
out.
13191330
@@ -1325,7 +1336,7 @@ def _filter_non_files(
13251336
if isinstance(base_value, _VALID_PATH_TYPES):
13261337
try:
13271338
norm_path = _normalize_path(base_value)
1328-
if (norm_path in keep_list or (
1339+
if norm_path not in ignore_list and (norm_path in keep_list or (
13291340
os.path.isdir(norm_path) and keep_directories) or
13301341
not os.path.isfile(norm_path)):
13311342
yield norm_path
@@ -1341,12 +1352,12 @@ def _filter_non_files(
13411352
for key in base_value.keys():
13421353
value = base_value[key]
13431354
for filter_value in _filter_non_files(
1344-
value, keep_list, keep_directories):
1355+
value, keep_list, ignore_list, keep_directories):
13451356
yield (value, filter_value)
13461357
elif isinstance(base_value, (list, set, tuple)):
13471358
for value in base_value:
13481359
for filter_value in _filter_non_files(
1349-
value, keep_list, keep_directories):
1360+
value, keep_list, ignore_list, keep_directories):
13501361
yield filter_value
13511362
else:
13521363
yield base_value
@@ -1432,8 +1443,9 @@ def _hash_file(file_path, hash_algorithm, buf_size=2**20):
14321443
"""
14331444
if hash_algorithm == 'sizetimestamp':
14341445
norm_path = _normalize_path(file_path)
1435-
return '%d:%f' % (
1436-
os.path.getsize(norm_path), os.path.getmtime(norm_path))
1446+
return '%d::%f::%s' % (
1447+
os.path.getsize(norm_path), os.path.getmtime(norm_path),
1448+
norm_path)
14371449
hash_func = hashlib.new(hash_algorithm)
14381450
with open(file_path, 'rb') as f:
14391451
binary_data = f.read(buf_size)
@@ -1445,12 +1457,13 @@ def _hash_file(file_path, hash_algorithm, buf_size=2**20):
14451457

14461458
def _normalize_path(path):
14471459
"""Convert `path` into normalized, normcase, absolute filepath."""
1448-
norm_path = os.path.normpath(os.path.normcase(path))
1460+
norm_path = os.path.normpath(path)
14491461
try:
1450-
return os.path.abspath(norm_path)
1462+
abs_path = os.path.abspath(norm_path)
14511463
except TypeError:
14521464
# this occurs when encountering VERY long strings that might be
14531465
# interpreted as paths
14541466
LOGGER.warn(
14551467
"failed to abspath %s so returning normalized path instead")
1456-
return norm_path
1468+
abs_path = norm_path
1469+
return os.path.normcase(abs_path)

tests/test_task.py

Lines changed: 154 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,18 @@
1212
import logging.handlers
1313
import multiprocessing
1414
import mock
15+
import importlib
1516

1617
import taskgraph
1718

1819
LOGGER = logging.getLogger(__name__)
1920

2021
N_TEARDOWN_RETRIES = 5
2122

22-
# Python 3 relocated the reload function to imp.
23-
if 'reload' not in __builtins__:
24-
from imp import reload
23+
24+
def _noop_function(**kwargs):
25+
"""Does nothing except allow kwargs to be passed."""
26+
pass
2527

2628

2729
def _long_running_function(delay):
@@ -167,7 +169,7 @@ def test_version_not_loaded(self):
167169
with self.assertRaises(RuntimeError):
168170
# RuntimeError is a side effect of `import taskgraph`, so we
169171
# reload it to retrigger the metadata load.
170-
taskgraph = reload(taskgraph)
172+
taskgraph = importlib.reload(taskgraph)
171173

172174
def test_single_task(self):
173175
"""TaskGraph: Test a single task."""
@@ -1197,6 +1199,154 @@ def test_duplicate_but_different_target(self):
11971199
contents = target_file.read()
11981200
self.assertEqual(contents, test_string)
11991201

1202+
def test_modifying_functions_with_copy(self):
1203+
"""TaskGraph: test with copy artifacts and ignore inputs."""
1204+
n_runs_a = 0
1205+
n_runs_b = 0
1206+
1207+
a_path = os.path.join(self.workspace_dir, 'a.txt')
1208+
b_path = os.path.join(self.workspace_dir, 'b.txt')
1209+
volatile_path = os.path.join(self.workspace_dir, 'volatile.txt')
1210+
d_path = os.path.join(self.workspace_dir, 'd.txt')
1211+
1212+
b_path_suffix = os.path.join(self.workspace_dir, 'b_suffix.txt')
1213+
volatile_path_suffix = os.path.join(self.workspace_dir, 'volatile_suffix.txt')
1214+
d_path_suffix = os.path.join(self.workspace_dir, 'd_suffix.txt')
1215+
1216+
with open(a_path, 'w') as a_file:
1217+
a_file.write('a file')
1218+
1219+
def run_a_batch(a_path, b_path, volatile_path, d_path):
1220+
def _a(a_path, target_path):
1221+
nonlocal n_runs_a
1222+
n_runs_a += 1
1223+
if not os.path.exists(a_path):
1224+
raise RuntimeError("a_path doesn't exist")
1225+
with open(target_path, 'w') as target_file:
1226+
target_file.write('_a result')
1227+
1228+
def _b(b_path, volatile_path, target_path):
1229+
nonlocal n_runs_b
1230+
n_runs_b += 1
1231+
if not os.path.exists(a_path):
1232+
raise RuntimeError("a_path path doesn't exist")
1233+
with open(volatile_path, 'w') as volitile_file:
1234+
volitile_file.write('_b volatile')
1235+
with open(target_path, 'w') as target_file:
1236+
target_file.write('_b result')
1237+
1238+
task_graph = taskgraph.TaskGraph(self.workspace_dir, -1, 0)
1239+
task_a = task_graph.add_task(
1240+
func=_a,
1241+
args=(a_path, b_path),
1242+
target_path_list=[b_path],
1243+
hash_algorithm='md5',
1244+
copy_duplicate_artifact=True,
1245+
task_name='_a task')
1246+
_ = task_graph.add_task(
1247+
func=_b,
1248+
args=(b_path, volatile_path, d_path),
1249+
target_path_list=[d_path],
1250+
ignore_path_list=[volatile_path],
1251+
hash_algorithm='md5',
1252+
copy_duplicate_artifact=True,
1253+
dependent_task_list=[task_a],
1254+
task_name='_b task')
1255+
task_graph.join()
1256+
task_graph.close()
1257+
del task_graph
1258+
1259+
run_a_batch(a_path, b_path, volatile_path, d_path)
1260+
run_a_batch(a_path, b_path, volatile_path, d_path)
1261+
run_a_batch(a_path, b_path_suffix, volatile_path_suffix, d_path_suffix)
1262+
1263+
self.assertTrue(n_runs_a == 1)
1264+
self.assertTrue(n_runs_b == 1)
1265+
1266+
def test_expected_path_list(self):
1267+
"""TaskGraph: test expected path list matches actual path list."""
1268+
def _create_file(target_path, content):
1269+
with open(target_path, 'w') as target_file:
1270+
target_file.write(content)
1271+
1272+
task_graph = taskgraph.TaskGraph(self.workspace_dir, -1, 0)
1273+
# note it is important this is a relative path that does not
1274+
# contain the drive letter on Windows.
1275+
absolute_target_file_path = os.path.join(
1276+
self.workspace_dir, 'a.txt')
1277+
relative_path = os.path.relpath(absolute_target_file_path)
1278+
1279+
_ = task_graph.add_task(
1280+
func=_create_file,
1281+
args=(relative_path, 'test value'),
1282+
target_path_list=[relative_path],
1283+
task_name='create file')
1284+
1285+
task_graph.close()
1286+
task_graph.join()
1287+
del task_graph
1288+
1289+
self.assertTrue('Ran without crashing!')
1290+
1291+
def test_kwargs_hashed(self):
1292+
"""TaskGraph: ensure kwargs are considered in determining id hash."""
1293+
task_graph = taskgraph.TaskGraph(self.workspace_dir, -1, 0)
1294+
1295+
task_a = task_graph.add_task(
1296+
func=_noop_function,
1297+
kwargs={
1298+
'content': ['this value: a']},
1299+
task_name='noop a')
1300+
1301+
task_b = task_graph.add_task(
1302+
func=_noop_function,
1303+
kwargs={
1304+
'content': ['this value b']},
1305+
task_name='noop b')
1306+
1307+
task_graph.close()
1308+
task_graph.join()
1309+
del task_graph
1310+
1311+
self.assertNotEqual(
1312+
task_a._task_id_hash, task_b._task_id_hash,
1313+
"task ids should be different since the kwargs are different")
1314+
1315+
def test_same_timestamp_and_value(self):
1316+
"""TaskGraph: ensure identical files but filename are noticed."""
1317+
task_graph = taskgraph.TaskGraph(self.workspace_dir, -1, 0)
1318+
1319+
file_a_path = os.path.join(self.workspace_dir, 'file_a.txt')
1320+
file_b_path = os.path.join(self.workspace_dir, 'file_b.txt')
1321+
1322+
with open(file_a_path, 'w') as file_a:
1323+
file_a.write('a')
1324+
with open(file_b_path, 'w') as file_b:
1325+
file_b.write('a')
1326+
1327+
os.utime(file_a_path, (0, 0))
1328+
os.utime(file_b_path, (0, 0))
1329+
1330+
task_a = task_graph.add_task(
1331+
func=_noop_function,
1332+
kwargs={
1333+
'path': file_a_path},
1334+
task_name='noop a')
1335+
1336+
task_b = task_graph.add_task(
1337+
func=_noop_function,
1338+
kwargs={
1339+
'path': file_b_path},
1340+
task_name='noop b')
1341+
1342+
task_graph.close()
1343+
task_graph.join()
1344+
del task_graph
1345+
1346+
self.assertNotEqual(
1347+
task_a._task_id_hash, task_b._task_id_hash,
1348+
"task ids should be different since the filenames are different")
1349+
12001350

12011351
def Fail(n_tries, result_path):
12021352
"""Create a function that fails after `n_tries`."""

0 commit comments

Comments
 (0)