[Debugger Plugin] Fix Python unit tests for TF2 #2583

Merged: 7 commits, merged on Aug 21, 2019
23 changes: 11 additions & 12 deletions tensorboard/plugins/debugger/debug_graphs_helper_test.py
@@ -37,6 +37,8 @@

import tensorflow as tf
from tensorflow.python import debug as tf_debug
# See discussion on issue #1996 for private module import justification.
from tensorflow.python import tf2 as tensorflow_python_tf2
Contributor:

Could you please add

# See discussion on issue #1996 for private module import justification.
from tensorflow.python import tf2 as tensorflow_python_tf2

(as in test_util.py)?

Contributor Author:

Done.

from tensorflow.python.debug.lib import grpc_debug_test_server

from tensorboard.compat.proto import config_pb2
@@ -85,7 +87,6 @@ def _createTestGraphAndRunOptions(self, sess, gated_grpc=True):
debug_urls=self.debug_server_url)
return z, run_options

@test_util.run_v1_only('Ops differ. Similar to [1].')
def testExtractGatedGrpcTensorsFoundGatedGrpcOps(self):
with tf.compat.v1.Session() as sess:
z, run_options = self._createTestGraphAndRunOptions(sess, gated_grpc=True)
@@ -108,15 +109,11 @@ def testExtractGatedGrpcTensorsFoundGatedGrpcOps(self):
gated_debug_ops = [
(item[0], item[2], item[3]) for item in gated_debug_ops]

# TODO(#1705): TF 2.0 breaks below.
self.assertIn(('a', 0, 'DebugIdentity'), gated_debug_ops)
self.assertIn(('a/read', 0, 'DebugIdentity'), gated_debug_ops)
self.assertIn(('b', 0, 'DebugIdentity'), gated_debug_ops)
self.assertIn(('b/read', 0, 'DebugIdentity'), gated_debug_ops)
self.assertIn(('c', 0, 'DebugIdentity'), gated_debug_ops)
self.assertIn(('c/read', 0, 'DebugIdentity'), gated_debug_ops)
self.assertIn(('d', 0, 'DebugIdentity'), gated_debug_ops)
self.assertIn(('d/read', 0, 'DebugIdentity'), gated_debug_ops)

self.assertIn(('x', 0, 'DebugIdentity'), gated_debug_ops)
self.assertIn(('y', 0, 'DebugIdentity'), gated_debug_ops)
self.assertIn(('z', 0, 'DebugIdentity'), gated_debug_ops)
@@ -151,9 +148,6 @@ def testExtractGatedGrpcTensorsFoundNoGatedGrpcOps(self):
self.assertEqual([], gated_debug_ops)


@test_util.run_v1_only((
'Graph creates different op structure in v2. See '
'debug_graphs_helper_test.py[1].'))
class BaseExpandedNodeNameTest(tf.test.TestCase):

def testMaybeBaseExpandedNodeName(self):
@@ -174,9 +168,14 @@ def testMaybeBaseExpandedNodeName(self):
self.assertEqual(
'bar/b/read',
graph_wrapper.maybe_base_expanded_node_name('bar/b/read'))
# TODO(#1705): TF 2.0 tf.add creates nested nodes.
self.assertEqual(
'baz/c', graph_wrapper.maybe_base_expanded_node_name('baz/c'))

if tensorflow_python_tf2.enabled():
# NOTE(#1705): TF 2.0 tf.add creates nested nodes.
self.assertEqual(
'baz/c/(c)', graph_wrapper.maybe_base_expanded_node_name('baz/c'))
else:
self.assertEqual(
'baz/c', graph_wrapper.maybe_base_expanded_node_name('baz/c'))


if __name__ == '__main__':
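
For context, a minimal self-contained sketch of the TF1/TF2 gating pattern used in the test above (assuming a TensorFlow build where tensorflow.python.tf2 is importable, as in this diff; the helper function and its suffix logic are illustrative only, not part of the change):

# See discussion on issue #1996 for private module import justification.
from tensorflow.python import tf2 as tensorflow_python_tf2

def expected_base_expanded_name(name):
  # Illustrative helper: under TF 2.x, ops such as tf.add may create nested
  # nodes, so the base-expanded name gains a '/(<leaf>)' suffix; under TF 1.x
  # the name is returned unchanged.
  if tensorflow_python_tf2.enabled():
    return '%s/(%s)' % (name, name.split('/')[-1])
  return name

print(expected_base_expanded_name('baz/c'))  # 'baz/c/(c)' in TF2, 'baz/c' in TF1
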
37 changes: 20 additions & 17 deletions tensorboard/plugins/debugger/interactive_debugger_plugin_test.py
@@ -33,7 +33,7 @@
import numpy as np
import portpicker # pylint: disable=import-error
from six.moves import urllib # pylint: disable=wrong-import-order
import tensorflow as tf # pylint: disable=wrong-import-order
import tensorflow.compat.v1 as tf # pylint: disable=wrong-import-order
from tensorflow.python import debug as tf_debug # pylint: disable=wrong-import-order
from werkzeug import test as werkzeug_test # pylint: disable=wrong-import-order
from werkzeug import wrappers # pylint: disable=wrong-import-order
@@ -44,11 +44,14 @@
from tensorboard.plugins.debugger import interactive_debugger_plugin
from tensorboard.util import test_util

# These unit tests for Debugger Plugin V1 are tied to TF1.x behavior
# (`tf.Session`s).
tf.disable_v2_behavior()
Contributor:

Interesting: simply adding tf.compat.v1.disable_v2_behavior() without
the rest of the patch doesn’t suffice to fix the timeouts on my end, so
some of the API differences must(?) be causing these unbounded
timeouts—do we know which ones?

In the short term, this patch is great because it lets us continue to
run the tests, but ideally we’d like to get rid of tf.compat.v1
wherever possible, too (#1718).

Contributor Author:

I added a comment here. The Debugger Plugin is currently tied to V1 behavior (tf.Session), which is why it is necessary to pin V1 behavior here. When tfdbg V2 comes along, this will of course no longer be necessary.

Contributor:

Right, that makes sense. But if session and node behavior were the only
salient changes then we would expect adding disable_v2_behavior() to
fix the tests, right? And yet if we make only that change then the
testMultipleSessionRunsTensorValueFullHistory test case hangs at
various points—sometimes in the listen call in setUp, sometimes in
the first call to _serverGet in the test body. I was wondering why
that is, and why the remaining changes, which just change tf.foo to
tf.compat.v1.foo, do fix the tests.

(This isn’t necessarily a blocking question, but I would be interested
to learn how this PR actually works.)

Contributor Author:

I confirmed what you said. It seems that making only these two changes:

  1. Using import tensorflow.compat.v1 as tf
  2. Using tf.disable_v2_behavior()

leads to hanging tests. This looks like some unexpected behavior in how tf.compat.v1.compat.v1.... resolves.

As you said, this doesn't have to be blocking. I can make a TODO item for myself to investigate and potentially file a bug against TensorFlow while merging this PR. LMK WYT.

Contributor:

Using tf.compat.v1.compat.v1 will simply not work, in either TF 1.x or
TF 2.x. But making only the disable_v2_behavior() change seems like
it should work, because the rest of the code already uses
tf.compat.v1.Session() explicitly.

This is good to merge; feel free to investigate iff you feel like it.

Contributor Author:

disable_v2_behavior() exists only in tf from TF1 or in tf.compat.v1 from TF2. So to use it, we'd have to do import tensorflow.compat.v1 as tf regardless (the TF1 tests are passing, so they are not a concern). But if you use that and don't change anything below, you'll just end up using tf.compat.v1.compat.v1.*, which I think is what causes the hanging.

I'll merge the PR. Thanks for the review.

Contributor:

Let me be concrete: the patch that I’m talking about is simply

diff --git a/tensorboard/plugins/debugger/interactive_debugger_plugin_test.py b/tensorboard/plugins/debugger/interactive_debugger_plugin_test.py
index 1b127456..653af030 100644
--- a/tensorboard/plugins/debugger/interactive_debugger_plugin_test.py
+++ b/tensorboard/plugins/debugger/interactive_debugger_plugin_test.py
@@ -45,6 +45,9 @@ from tensorboard.plugins.debugger import interactive_debugger_plugin
 from tensorboard.util import test_util
 
 
+tf.compat.v1.disable_v2_behavior()
+
+
 _SERVER_URL_PREFIX = '/data/plugin/debugger/'
 
 

against master prior to this PR (i.e., 2b96c2a).

This properly disables V2 behavior, does not use compat.v1.compat.v1,
and uses tf.compat.v1.Summary, but still hangs.

Contributor:

And tf.compat.v1.compat.v1 is a hard error, not a loop.

Contributor Author:

Acknowledged. Thanks for the concrete info.
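
For reference, a minimal self-contained sketch of the V1-pinning pattern discussed in this thread (assuming TF 2.x is installed; the small graph below is illustrative, not taken from the tests):

import tensorflow.compat.v1 as tf  # alias the entire v1 API surface

# Pin graph-mode / tf.Session semantics before any graphs are built.
tf.disable_v2_behavior()

with tf.Session() as sess:
  a = tf.Variable([10.0], name='a')
  b = tf.Variable([20.0], name='b')
  x = tf.multiply(a, b, name='x')
  sess.run(tf.global_variables_initializer())
  print(sess.run(x))  # [200.]

With tf aliased this way, the rest of the diff rewrites the remaining tf.compat.v1.* references to plain tf.* so that they do not end up targeting the doubled compat.v1.compat.v1 path discussed above.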



_SERVER_URL_PREFIX = '/data/plugin/debugger/'


@test_util.run_v1_only('Test fails to run and clean up properly; they time out.')
class InteractiveDebuggerPluginTest(tf.test.TestCase):

def setUp(self):
@@ -116,14 +119,14 @@ def _deserializeResponse(self, response):
def _runSimpleAddMultiplyGraph(self, variable_size=1):
session_run_results = []
def session_run_job():
with tf.compat.v1.Session() as sess:
with tf.Session() as sess:
a = tf.Variable([10.0] * variable_size, name='a')
b = tf.Variable([20.0] * variable_size, name='b')
c = tf.Variable([30.0] * variable_size, name='c')
x = tf.multiply(a, b, name="x")
y = tf.add(x, c, name="y")

sess.run(tf.compat.v1.global_variables_initializer())
sess.run(tf.global_variables_initializer())

sess = tf_debug.TensorBoardDebugWrapperSession(sess, self._debugger_url)
session_run_results.append(sess.run(y))
@@ -134,12 +137,12 @@ def session_run_job():
def _runMultiStepAssignAddGraph(self, steps):
session_run_results = []
def session_run_job():
with tf.compat.v1.Session() as sess:
with tf.Session() as sess:
a = tf.Variable(10, dtype=tf.int32, name='a')
b = tf.Variable(1, dtype=tf.int32, name='b')
inc_a = tf.compat.v1.assign_add(a, b, name='inc_a')
inc_a = tf.assign_add(a, b, name='inc_a')

sess.run(tf.compat.v1.global_variables_initializer())
sess.run(tf.global_variables_initializer())

sess = tf_debug.TensorBoardDebugWrapperSession(sess, self._debugger_url)
for _ in range(steps):
@@ -151,15 +154,15 @@ def session_run_job():
def _runTfGroupGraph(self):
session_run_results = []
def session_run_job():
with tf.compat.v1.Session() as sess:
with tf.Session() as sess:
a = tf.Variable(10, dtype=tf.int32, name='a')
b = tf.Variable(20, dtype=tf.int32, name='b')
d = tf.constant(1, dtype=tf.int32, name='d')
inc_a = tf.compat.v1.assign_add(a, d, name='inc_a')
inc_b = tf.compat.v1.assign_add(b, d, name='inc_b')
inc_a = tf.assign_add(a, d, name='inc_a')
inc_b = tf.assign_add(b, d, name='inc_b')
inc_ab = tf.group([inc_a, inc_b], name="inc_ab")

sess.run(tf.compat.v1.global_variables_initializer())
sess.run(tf.global_variables_initializer())

sess = tf_debug.TensorBoardDebugWrapperSession(sess, self._debugger_url)
session_run_results.append(sess.run(inc_ab))
@@ -708,7 +711,7 @@ def testGetSourceOpTraceback(self):
def _runInitializer(self):
session_run_results = []
def session_run_job():
with tf.compat.v1.Session() as sess:
with tf.Session() as sess:
a = tf.Variable([10.0] * 10, name='a')
sess = tf_debug.TensorBoardDebugWrapperSession(sess, self._debugger_url)
# Run the initializer with a debugger-wrapped tf.Session.
@@ -786,7 +789,7 @@ def testCommDataForUninitializedTensorIsHandledCorrectly(self):
def _runHealthPillNetwork(self):
session_run_results = []
def session_run_job():
with tf.compat.v1.Session() as sess:
with tf.Session() as sess:
a = tf.Variable(
[np.nan, np.inf, np.inf, -np.inf, -np.inf, -np.inf, 10, 20, 30],
dtype=tf.float32, name='a')
@@ -833,11 +836,11 @@ def testHealthPill(self):
def _runAsciiStringNetwork(self):
session_run_results = []
def session_run_job():
with tf.compat.v1.Session() as sess:
with tf.Session() as sess:
str1 = tf.Variable('abc', name='str1')
str2 = tf.Variable('def', name='str2')
str_concat = tf.add(str1, str2, name='str_concat')
sess.run(tf.compat.v1.global_variables_initializer())
sess.run(tf.global_variables_initializer())
sess = tf_debug.TensorBoardDebugWrapperSession(sess, self._debugger_url)
session_run_results.append(sess.run(str_concat))
session_run_thread = threading.Thread(target=session_run_job)
@@ -890,11 +893,11 @@ def testAsciiStringTensorIsHandledCorrectly(self):
def _runBinaryStringNetwork(self):
session_run_results = []
def session_run_job():
with tf.compat.v1.Session() as sess:
with tf.Session() as sess:
str1 = tf.Variable([b'\x01' * 3, b'\x02' * 3], name='str1')
str2 = tf.Variable([b'\x03' * 3, b'\x04' * 3], name='str2')
str_concat = tf.add(str1, str2, name='str_concat')
sess.run(tf.compat.v1.global_variables_initializer())
sess.run(tf.global_variables_initializer())
sess = tf_debug.TensorBoardDebugWrapperSession(sess, self._debugger_url)
session_run_results.append(sess.run(str_concat))
session_run_thread = threading.Thread(target=session_run_job)
33 changes: 18 additions & 15 deletions tensorboard/plugins/debugger/session_debug_test.py
@@ -35,15 +35,18 @@

import numpy as np
import portpicker # pylint: disable=import-error
import tensorflow as tf # pylint: disable=wrong-import-order
import tensorflow.compat.v1 as tf # pylint: disable=wrong-import-order
from tensorflow.python import debug as tf_debug # pylint: disable=wrong-import-order

from tensorboard.plugins.debugger import constants
from tensorboard.plugins.debugger import debugger_server_lib
from tensorboard.util import test_util

# These unit tests for Debugger Plugin V1 are tied to TF1.x behavior
# (`tf.Session`s).
tf.disable_v2_behavior()


@test_util.run_v1_only("Server fails to come up. Requires study.")
class SessionDebugTestBase(tf.test.TestCase):

def setUp(self):
@@ -67,17 +70,17 @@ def tearDown(self):
if os.path.isdir(self._logdir):
shutil.rmtree(self._logdir)

tf.compat.v1.reset_default_graph()
tf.reset_default_graph()

def _poll_server_till_success(self, max_tries, poll_interval_seconds):
for _ in range(max_tries):
try:
with tf.compat.v1.Session() as sess:
with tf.Session() as sess:
a_init_val = np.array([42.0])
a_init = tf.constant(a_init_val, shape=[1], name="a_init")
a = tf.Variable(a_init, name="a")

run_options = tf.compat.v1.RunOptions(output_partition_graphs=True)
run_options = tf.RunOptions(output_partition_graphs=True)
tf_debug.watch_graph(run_options,
sess.graph,
debug_ops=["DebugNumericSummary"],
@@ -125,7 +128,7 @@ def _compute_health_pill(self, x):
def _check_health_pills_in_events_file(self,
events_file_path,
debug_key_to_tensors):
reader = tf.compat.v1.python_io.tf_record_iterator(events_file_path)
reader = tf.python_io.tf_record_iterator(events_file_path)
event_read = tf.Event()

# The first event in the file should contain the events version, which is
@@ -159,7 +162,7 @@ def _check_health_pills_in_events_file(self,
health_pills[debug_key][i])

def testRunSimpleNetworkoWithInfAndNaNWorks(self):
with tf.compat.v1.Session() as sess:
with tf.Session() as sess:
x_init_val = np.array([[2.0], [-1.0]])
y_init_val = np.array([[0.0], [-0.25]])
z_init_val = np.array([[0.0, 3.0], [-1.0, 0.0]])
@@ -171,14 +174,14 @@ def testRunSimpleNetworkoWithInfAndNaNWorks(self):
z_init = tf.constant(z_init_val, shape=[2, 2])
z = tf.Variable(z_init, name="z")

u = tf.compat.v1.div(x, y, name="u") # Produces an Inf.
u = tf.div(x, y, name="u") # Produces an Inf.
v = tf.matmul(z, u, name="v") # Produces NaN and Inf.

sess.run(x.initializer)
sess.run(y.initializer)
sess.run(z.initializer)

run_options = tf.compat.v1.RunOptions(output_partition_graphs=True)
run_options = tf.RunOptions(output_partition_graphs=True)
tf_debug.watch_graph(run_options,
sess.graph,
debug_ops=["DebugNumericSummary"],
@@ -221,18 +224,18 @@ def testRunSimpleNetworkoWithInfAndNaNWorks(self):
self.assertEqual(0, report[1].pos_inf_event_count)

def testMultipleInt32ValuesOverMultipleRunsAreRecorded(self):
with tf.compat.v1.Session() as sess:
with tf.Session() as sess:
x_init_val = np.array([10], dtype=np.int32)
x_init = tf.constant(x_init_val, shape=[1], name="x_init")
x = tf.Variable(x_init, name="x")

x_inc_val = np.array([2], dtype=np.int32)
x_inc = tf.constant(x_inc_val, name="x_inc")
inc_x = tf.compat.v1.assign_add(x, x_inc, name="inc_x")
inc_x = tf.assign_add(x, x_inc, name="inc_x")

sess.run(x.initializer)

run_options = tf.compat.v1.RunOptions(output_partition_graphs=True)
run_options = tf.RunOptions(output_partition_graphs=True)
tf_debug.watch_graph(run_options,
sess.graph,
debug_ops=["DebugNumericSummary"],
@@ -266,7 +269,7 @@ def testConcurrentNumericsAlertsAreRegisteredCorrectly(self):
# Before any Session runs, the report ought to be empty.
self.assertEqual([], self._debug_data_server.numerics_alert_report())

with tf.compat.v1.Session() as sess:
with tf.Session() as sess:
x_init_val = np.array([[2.0], [-1.0]])
y_init_val = np.array([[0.0], [-0.25]])
z_init_val = np.array([[0.0, 3.0], [-1.0, 0.0]])
@@ -278,7 +281,7 @@ def testConcurrentNumericsAlertsAreRegisteredCorrectly(self):
z_init = tf.constant(z_init_val, shape=[2, 2])
z = tf.Variable(z_init, name="z")

u = tf.compat.v1.div(x, y, name="u") # Produces an Inf.
u = tf.div(x, y, name="u") # Produces an Inf.
v = tf.matmul(z, u, name="v") # Produces NaN and Inf.

sess.run(x.initializer)
@@ -287,7 +290,7 @@

run_options_list = []
for i in range(num_threads):
run_options = tf.compat.v1.RunOptions(output_partition_graphs=True)
run_options = tf.RunOptions(output_partition_graphs=True)
# Use different grpc:// URL paths so that each thread opens a separate
# gRPC stream to the debug data server, simulating multi-worker setting.
tf_debug.watch_graph(run_options,