Do you want to try out this notebook? Get a free account (no credit card required) at hopsworks.ai. You can also install open-source Hopsworks or view tutorial videos here.
Visualization - What-if-Tool (Jupyter Classic)
What-If Tool in Jupyter Classic (does not work on Jupyterlab)
WARNING: This notebook only runs on “classic” Jupyter, not on JupyterLab.
This notebook shows how to use the What-If Tool inside a Jupyter notebook.
This notebook trains a linear classifier on the UCI census problem (predicting whether a person earns more than $50K from their census information).
It then visualizes the results of the trained classifier on test data using the What-If Tool.
#@title Define helper functions {display-mode: "form"}
import pandas as pd
import numpy as np
import tensorflow as tf
import functools
# Creates a tf feature spec from the dataframe and columns specified.
def create_feature_spec(df, columns=None):
    """Build a tf.Example parsing spec from the dtypes of a dataframe's columns.

    Args:
        df: pandas DataFrame whose column dtypes decide each feature's type.
        columns: Names of the columns to include. When None, every column
            of ``df`` is used.

    Returns:
        Dict mapping each column name to a scalar ``tf.io.FixedLenFeature``:
        int64 columns are declared as ``tf.int64``, float64 columns as
        ``tf.float32``, and columns of any other dtype as ``tf.string``.
    """
    if columns is None:
        columns = df.columns.values.tolist()
    feature_spec = {}
    for f in columns:
        col_dtype = df[f].dtype
        # Compare dtypes by equality rather than identity: two dtype objects
        # describing the same type are not guaranteed to be the same object.
        if col_dtype == np.dtype(np.int64):
            parsed_dtype = tf.int64
        elif col_dtype == np.dtype(np.float64):
            parsed_dtype = tf.float32
        else:
            parsed_dtype = tf.string
        feature_spec[f] = tf.io.FixedLenFeature(shape=(), dtype=parsed_dtype)
    return feature_spec
# Creates simple numeric and categorical feature columns from a feature spec and a
# list of columns from that spec to use.
#
# NOTE: Models might perform better with some feature engineering such as bucketed
# numeric columns and hash-bucket/embedding columns for categorical features.
def create_feature_columns(columns, feature_spec, vocab_df=None):
    """Create simple numeric and categorical feature columns from a feature spec.

    Args:
        columns: Names of the spec entries to turn into feature columns.
        feature_spec: Dict mapping column name to a parsing feature whose
            ``dtype`` selects the kind of feature column to build.
        vocab_df: DataFrame whose unique values form the vocabulary of each
            categorical column. When None, the notebook-level ``df`` is used,
            which is the dataframe this function has always read from.

    Returns:
        List of feature columns in the order of ``columns``: a numeric column
        for ``tf.int64`` / ``tf.float32`` entries, otherwise an indicator
        column over a vocabulary list of the column's unique values.
    """
    if vocab_df is None:
        # Backward-compatible fallback to the notebook's global training dataframe.
        vocab_df = df
    ret = []
    for col in columns:
        # Membership test compares dtypes by equality instead of identity.
        if feature_spec[col].dtype in (tf.int64, tf.float32):
            ret.append(tf.feature_column.numeric_column(col))
        else:
            vocabulary = list(vocab_df[col].unique())
            ret.append(tf.feature_column.indicator_column(
                tf.feature_column.categorical_column_with_vocabulary_list(col, vocabulary)))
    return ret
# An input function for providing input to a model from tf.Examples
def tfexamples_input_fn(examples, feature_spec, label, mode=tf.estimator.ModeKeys.EVAL,
                        num_epochs=None,
                        batch_size=64):
    """Build a batched dataset of (features, label) pairs from tf.Example protos.

    Args:
        examples: Sequence of protos; each must provide ``SerializeToString()``.
        feature_spec: Parsing spec handed to ``parse_tf_example``.
        label: Key of the feature that is split off as the label.
        mode: Estimator mode; the examples are shuffled only for ``TRAIN``.
        num_epochs: Forwarded unchanged to ``Dataset.repeat``.
        batch_size: Number of serialized examples per batch; also sizes the
            shuffle buffer (``2 * batch_size + 1``).

    Returns:
        The resulting ``tf.data.Dataset``.
    """
    def serialized_examples():
        # Serialize lazily so protos are only converted when the dataset is consumed.
        for example in examples:
            yield example.SerializeToString()

    def parse_batch(serialized_batch):
        return parse_tf_example(serialized_batch, label, feature_spec)

    dataset = tf.data.Dataset.from_generator(
        serialized_examples, tf.dtypes.string, tf.TensorShape([]))
    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.shuffle(buffer_size=2 * batch_size + 1)
    # Batch first, then parse whole batches, then repeat.
    return dataset.batch(batch_size).map(parse_batch).repeat(num_epochs)
# Parses Tf.Example protos into features for the input function.
def parse_tf_example(example_proto, label, feature_spec):
    """Parse serialized tf.Example data and split the label off the features.

    Args:
        example_proto: Serialized example data passed to ``tf.io.parse_example``.
        label: Key of the parsed feature to return separately as the target.
        feature_spec: Parsing spec describing the features to extract.

    Returns:
        Tuple ``(features, target)`` where ``features`` is the parsed feature
        dict with the ``label`` entry removed and ``target`` is that entry.
    """
    features = tf.io.parse_example(serialized=example_proto, features=feature_spec)
    label_values = features.pop(label)
    return features, label_values
# Converts a dataframe into a list of tf.Example protos.
def df_to_examples(df, columns=None):
    """Convert a dataframe into a list of tf.Example protos, one per row.

    Args:
        df: pandas DataFrame to convert.
        columns: Names of the columns to write into each example. When None,
            every column of ``df`` is used.

    Returns:
        List of ``tf.train.Example`` in row order. int64 columns are stored
        in ``int64_list``, float64 columns in ``float_list``, and all other
        columns as UTF-8 encoded ``bytes_list`` values. Missing values in
        those other columns are skipped, so the feature is absent from that
        example.
    """
    if columns is None:
        columns = df.columns.values.tolist()

    # Classify each column once instead of re-reading its dtype for every row.
    # Dtypes are compared by equality rather than object identity.
    int_dtype = np.dtype(np.int64)
    float_dtype = np.dtype(np.float64)
    column_kinds = {}
    for col in columns:
        col_dtype = df[col].dtype
        if col_dtype == int_dtype:
            column_kinds[col] = 'int'
        elif col_dtype == float_dtype:
            column_kinds[col] = 'float'
        else:
            column_kinds[col] = 'bytes'

    examples = []
    for _, row in df.iterrows():
        example = tf.train.Example()
        for col in columns:
            value = row[col]
            kind = column_kinds[col]
            # Only index ``example.features.feature`` inside a branch that
            # actually writes a value, so skipped cells leave no entry behind.
            if kind == 'int':
                example.features.feature[col].int64_list.value.append(int(value))
            elif kind == 'float':
                example.features.feature[col].float_list.value.append(value)
            elif pd.notna(value):
                example.features.feature[col].bytes_list.value.append(value.encode('utf-8'))
        examples.append(example)
    return examples
# Converts a dataframe column into a column of 0's and 1's based on the provided test.
# Used to force label columns to be numeric for binary classification using a TF estimator.
def make_label_column_numeric(df, label_column, test):
    """Overwrite a dataframe column in place with 0/1 values.

    Args:
        df: DataFrame to modify; ``df[label_column]`` is replaced.
        label_column: Name of the column to convert.
        test: Callable applied to the whole column; where its result is
            truthy the new value is 1, elsewhere 0.

    Returns:
        None. The change is made on ``df`` itself.
    """
    is_positive = test(df[label_column])
    df[label_column] = np.where(is_positive, 1, 0)
#@title Read training dataset from CSV {display-mode: "form"}
import pandas as pd
import hops.hdfs as hdfs
# Set the path to the CSV containing the dataset to train on.
# csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
# Set the column names for the columns in the CSV. If the CSV's first line is a header line containing
# the column names, then set this to None.
csv_columns = [
    "Age",
    "Workclass",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Marital-Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital-Gain",
    "Capital-Loss",
    "Hours-per-week",
    "Country",
    "Over-50K",
]
# Load the training CSV from the project's dataset directory in HDFS.
h = hdfs.get_fs()
train_csv_location = hdfs.project_path() + "/TourData/census/adult.data"
with h.open_file(train_csv_location, "rt") as trainFile:
    # The file has no header row, so the column names are supplied explicitly.
    df = pd.read_csv(trainFile, names=csv_columns, skipinitialspace=True)
# Alternative when reading from a URL/local path instead of HDFS:
# df = pd.read_csv(csv_path, names=csv_columns, skipinitialspace=True)
# Last expression of the cell: displays the loaded dataframe.
df
Age | Workclass | fnlwgt | Education | Education-Num | Marital-Status | Occupation | Relationship | Race | Sex | Capital-Gain | Capital-Loss | Hours-per-week | Country | Over-50K | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 39 | State-gov | 77516 | Bachelors | 13 | Never-married | Adm-clerical | Not-in-family | White | Male | 2174 | 0 | 40 | United-States | <=50K |
1 | 50 | Self-emp-not-inc | 83311 | Bachelors | 13 | Married-civ-spouse | Exec-managerial | Husband | White | Male | 0 | 0 | 13 | United-States | <=50K |
2 | 38 | Private | 215646 | HS-grad | 9 | Divorced | Handlers-cleaners | Not-in-family | White | Male | 0 | 0 | 40 | United-States | <=50K |
3 | 53 | Private | 234721 | 11th | 7 | Married-civ-spouse | Handlers-cleaners | Husband | Black | Male | 0 | 0 | 40 | United-States | <=50K |
4 | 28 | Private | 338409 | Bachelors | 13 | Married-civ-spouse | Prof-specialty | Wife | Black | Female | 0 | 0 | 40 | Cuba | <=50K |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
32556 | 27 | Private | 257302 | Assoc-acdm | 12 | Married-civ-spouse | Tech-support | Wife | White | Female | 0 | 0 | 38 | United-States | <=50K |
32557 | 40 | Private | 154374 | HS-grad | 9 | Married-civ-spouse | Machine-op-inspct | Husband | White | Male | 0 | 0 | 40 | United-States | >50K |
32558 | 58 | Private | 151910 | HS-grad | 9 | Widowed | Adm-clerical | Unmarried | White | Female | 0 | 0 | 40 | United-States | <=50K |
32559 | 22 | Private | 201490 | HS-grad | 9 | Never-married | Adm-clerical | Own-child | White | Male | 0 | 0 | 20 | United-States | <=50K |
32560 | 52 | Self-emp-inc | 287927 | HS-grad | 9 | Married-civ-spouse | Exec-managerial | Wife | White | Female | 15024 | 0 | 40 | United-States | >50K |
32561 rows × 15 columns
#@title Specify input columns and column to predict {display-mode: "form"}
import numpy as np
# Name of the dataset column the model should predict.
label_column = 'Over-50K'
# Dataset columns used as model inputs.
input_features = [
    'Age',
    'Workclass',
    'Education',
    'Marital-Status',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'Capital-Gain',
    'Capital-Loss',
    'Hours-per-week',
    'Country',
]
# Every model input plus the label column.
features_and_labels = [*input_features, label_column]
# Make the label numeric in place: rows whose value equals '>50K' become the
# positive class (1) and all other rows become the negative class (0).
make_label_column_numeric(df, label_column, lambda val: val == '>50K')
#@title Convert dataset to tf.Example protos {display-mode: "form"}
examples = df_to_examples(df)
#@title Create and train the classifier {display-mode: "form"}
num_steps = 1000 #@param {type: "number"}
# Parsing spec covering the model inputs and the label column.
feature_spec = create_feature_spec(df, features_and_labels)
# Feature columns are built for the inputs only; the label is not a model input.
linear_feature_columns = create_feature_columns(input_features, feature_spec)
classifier = tf.estimator.LinearClassifier(feature_columns=linear_feature_columns)
# Bind the data arguments up front; the remaining keyword parameters of
# tfexamples_input_fn stay open on the partial.
train_inpf = functools.partial(tfexamples_input_fn, examples, feature_spec, label_column)
# Last expression of the cell: trains for num_steps steps and displays the result.
classifier.train(train_inpf, steps=num_steps)
INFO:tensorflow:Using default config.
WARNING:tensorflow:Using temporary folder as model directory: /tmp/tmphg54156d
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmphg54156d', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
rewrite_options {
meta_optimizer_iterations: ONE
}
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
WARNING:tensorflow:From /srv/hops/anaconda/envs/theenv/lib/python3.6/site-packages/tensorflow/python/training/training_util.py:236: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version.
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
INFO:tensorflow:Calling model_fn.
WARNING:tensorflow:From /srv/hops/anaconda/envs/theenv/lib/python3.6/site-packages/tensorflow_estimator/python/estimator/canned/linear.py:1481: Layer.add_variable (from tensorflow.python.keras.engine.base_layer_v1) is deprecated and will be removed in a future version.
Instructions for updating:
Please use `layer.add_weight` method instead.
WARNING:tensorflow:From /srv/hops/anaconda/envs/theenv/lib/python3.6/site-packages/tensorflow/python/keras/optimizer_v2/ftrl.py:112: calling Constant.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 0...
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmphg54156d/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 0...
INFO:tensorflow:loss = 0.6931472, step = 0
INFO:tensorflow:global_step/sec: 33.0106
INFO:tensorflow:loss = 1.2316809, step = 100 (3.032 sec)
INFO:tensorflow:global_step/sec: 35.9212
INFO:tensorflow:loss = 2.1111045, step = 200 (2.783 sec)
INFO:tensorflow:global_step/sec: 36.3198
INFO:tensorflow:loss = 1.3279744, step = 300 (2.754 sec)
INFO:tensorflow:global_step/sec: 35.3992
INFO:tensorflow:loss = 2.732805, step = 400 (2.825 sec)
INFO:tensorflow:global_step/sec: 34.4937
INFO:tensorflow:loss = 0.6547704, step = 500 (2.899 sec)
INFO:tensorflow:global_step/sec: 36.1927
INFO:tensorflow:loss = 0.5864808, step = 600 (2.762 sec)
INFO:tensorflow:global_step/sec: 35.5821
INFO:tensorflow:loss = 0.29896152, step = 700 (2.811 sec)
INFO:tensorflow:global_step/sec: 35.4071
INFO:tensorflow:loss = 2.0995965, step = 800 (2.827 sec)
INFO:tensorflow:global_step/sec: 34.6815
INFO:tensorflow:loss = 0.96475744, step = 900 (2.881 sec)
INFO:tensorflow:Calling checkpoint listeners before saving checkpoint 1000...
INFO:tensorflow:Saving checkpoints for 1000 into /tmp/tmphg54156d/model.ckpt.
INFO:tensorflow:Calling checkpoint listeners after saving checkpoint 1000...
INFO:tensorflow:Loss for final step: 0.6493055.
<tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2 at 0x7f0628022f28>
#@title Invoke What-If Tool for test data and the trained model {display-mode: "form"}
num_datapoints = 2000 #@param {type: "number"}
tool_height_in_px = 1000 #@param {type: "number"}
from witwidget.notebook.visualization import WitConfigBuilder, WitWidget
# Alternative when reading the test set from a URL instead of HDFS:
#test_csv_path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
#test_df = pd.read_csv(test_csv_path, names=csv_columns, skipinitialspace=True,
# skiprows=1)
# Load the test CSV from the project's dataset directory in HDFS.
h = hdfs.get_fs()
test_csv_location = hdfs.project_path() + "/TourData/census/adult.test"
with h.open_file(test_csv_location, "rt") as testFile:
    # The first line of the file is skipped; column names are supplied explicitly.
    test_df = pd.read_csv(testFile, names=csv_columns, skipinitialspace=True, skiprows=1)
# Positive-class marker passed for the test file is '>50K.' (with a trailing period).
make_label_column_numeric(test_df, label_column, lambda val: val == '>50K.')
# Only the first num_datapoints rows are converted and handed to the tool.
test_examples = df_to_examples(test_df[0:num_datapoints])
# Configure the tool with the test examples, the trained classifier and
# display names for the two classes.
config_builder = WitConfigBuilder(test_examples)
config_builder = config_builder.set_estimator_and_feature_spec(classifier, feature_spec)
config_builder = config_builder.set_label_vocab(['Under 50K', 'Over 50K'])
# Last expression of the cell: renders the widget.
WitWidget(config_builder, height=tool_height_in_px)
WitWidget(config={'model_type': 'classification', 'label_vocab': ['Under 50K', 'Over 50K'], 'are_sequence_exam…