[WIP] Change image segmentation example to tfds #2925

Closed
wants to merge 1 commit into from
18 changes: 16 additions & 2 deletions pyzoo/zoo/examples/orca/learn/tf/image_segmentation/README.md
@@ -20,16 +20,30 @@ pip install analytics_zoo-${VERSION}-${TIMESTAMP}-py2.py3-none-${OS}_x86_64.whl
Note: a conda environment is required to run on Yarn, but is not strictly necessary for running locally.
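
For example, a minimal conda setup might look like this (the environment name and Python version here are illustrative assumptions, not requirements):

```bash
# Illustrative only: use any environment name and a Python version supported by Analytics Zoo
conda create -n zoo python=3.7
conda activate zoo
```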

## Data Preparation
You should manually download the dataset from Kaggle [carvana-image-masking-challenge](https://www.kaggle.com/c/carvana-image-masking-challenge/data) and save it to `/tmp/carvana/`. We will need three files: train.zip, train_masks.zip, and train_masks.csv.zip.
You should manually download the dataset from Kaggle [carvana-image-masking-challenge](https://www.kaggle.com/c/carvana-image-masking-challenge/data) and save it to `~/tensorflow_datasets/downloads/manual/carvana`. We will need two files: train.zip and train_masks.zip.
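
For example, assuming the two archives were downloaded to `~/Downloads` (a hypothetical location; adjust to wherever your browser saved them), you can place them as follows:

```bash
# Create the manual download directory expected by the Carvana builder
mkdir -p ~/tensorflow_datasets/downloads/manual/carvana
# Move the two archives into it (the source path is an assumption)
cp ~/Downloads/train.zip ~/Downloads/train_masks.zip \
   ~/tensorflow_datasets/downloads/manual/carvana/
```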

## Run example on local
```bash
python image_segmentation.py --cluster_mode local
```

## Run example on yarn cluster

If you have not run image_segmentation.py locally, you can run the following commands to prepare the TFRecord
files locally and put them on HDFS. Although tensorflow_datasets supports preparing data directly on HDFS,
doing so may generate a large number of small files, which can harm HDFS performance.

```bash
python image_segmentation.py --cluster_mode yarn
python carvana_datasets.py
hadoop fs -put ~/tensorflow_datasets/ /tensorflow_datasets/
```


```bash
source ${HADOOP_HOME}/libexec/hadoop-config.sh # setting HADOOP_HDFS_HOME, LD_LIBRARY_PATH, etc
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${JAVA_HOME}/jre/lib/amd64/server

CLASSPATH=$(${HADOOP_HOME}/bin/hadoop classpath --glob) python image_segmentation.py --cluster_mode yarn
```
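
Optionally, you can sanity-check that the prepared dataset is visible on HDFS before submitting (this assumes the `hadoop fs -put` destination used above):

```bash
hadoop fs -ls /tensorflow_datasets/
```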

Options
15 changes: 15 additions & 0 deletions pyzoo/zoo/examples/orca/learn/tf/image_segmentation/__init__.py
@@ -0,0 +1,15 @@
#
# Copyright 2018 Analytics Zoo Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
79 changes: 79 additions & 0 deletions pyzoo/zoo/examples/orca/learn/tf/image_segmentation/carvana_datasets.py
@@ -0,0 +1,79 @@
#
# Copyright 2018 Analytics Zoo Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import tensorflow_datasets.public_api as tfds
import tensorflow as tf
import os


class Carvana(tfds.core.GeneratorBasedBuilder):
    """TFDS builder for the Kaggle Carvana Image Masking Challenge dataset."""

    VERSION = tfds.core.Version('0.1.0')

    MANUAL_DOWNLOAD_INSTRUCTIONS = """\
    Download train.zip and train_masks.zip to {data_dir}/downloads/manual/carvana/,
    {data_dir} defaults to ~/tensorflow_datasets/
    """

    def _info(self):
        return tfds.core.DatasetInfo(
            builder=self,
            # tfds.features.FeatureConnectors
            features=tfds.features.FeaturesDict({
                "image_description": tfds.features.Text(),
                "image": tfds.features.Image(),
                # Masks are GIFs, which decode to 4-D (frames, H, W, 3) tensors
                "mask": tfds.features.Image(shape=(None, None, None, 3)),
            }),
            supervised_keys=("image", "mask"))

    def _split_generators(self, dl_manager):
        # Locate and extract the manually downloaded archives
        train_path = os.path.join(dl_manager.manual_dir, 'train.zip')
        train_mask_path = os.path.join(dl_manager.manual_dir, 'train_masks.zip')
        extracted_train_path = dl_manager.extract(train_path)
        extracted_train_mask_path = dl_manager.extract(train_mask_path)

        # Specify the splits
        return [
            tfds.core.SplitGenerator(
                name=tfds.Split.TRAIN,
                gen_kwargs={
                    "train_dir_path": os.path.join(extracted_train_path, "train"),
                    "train_mask_dir_path": os.path.join(extracted_train_mask_path, "train_masks"),
                },
            ),
        ]

    def _generate_examples(self, train_dir_path, train_mask_dir_path):
        # Read the input data out of the source files
        data = []
        for image_file in tf.io.gfile.listdir(train_dir_path):
            image_id = image_file[:-4]  # strip the ".jpg" extension
            mask_file = f"{image_id}_mask.gif"
            data.append((image_id, image_file, mask_file))

        # And yield examples as feature dictionaries
        for image_id, image, mask in data:
            yield image_id, {
                "image_description": image_id,
                "image": os.path.join(train_dir_path, image),
                "mask": os.path.join(train_mask_dir_path, mask),
            }


if __name__ == "__main__":
    dataset_builder = Carvana()
    dataset_builder.download_and_prepare()
pyzoo/zoo/examples/orca/learn/tf/image_segmentation/image_segmentation.py
@@ -32,69 +32,38 @@
from zoo.orca import init_orca_context, stop_orca_context
from zoo.orca.data import XShards
from zoo.orca.learn.tf.estimator import Estimator


def load_data_from_zip(file_path, file):
    with zipfile.ZipFile(os.path.join(file_path, file), "r") as zip_ref:
        unzipped_file = zip_ref.namelist()[0]
        zip_ref.extractall(file_path)


def load_data(file_path):
    load_data_from_zip(file_path, 'train.zip')
    load_data_from_zip(file_path, 'train_masks.zip')
    load_data_from_zip(file_path, 'train_masks.csv.zip')


def main(cluster_mode, max_epoch, file_path, batch_size):
    if cluster_mode == "local":
        init_orca_context(cluster_mode="local", cores=4, memory="3g")
    elif cluster_mode == "yarn":
        init_orca_context(cluster_mode="yarn-client", num_nodes=2, cores=2, driver_memory="3g")

    load_data(file_path)
    img_dir = os.path.join(file_path, "train")
    label_dir = os.path.join(file_path, "train_masks")

    # Here we only take the first 1000 files for simplicity
    df_train = pd.read_csv(os.path.join(file_path, 'train_masks.csv'))
    ids_train = df_train['img'].map(lambda s: s.split('.')[0])
    ids_train = ids_train[:1000]

    x_train_filenames = []
    y_train_filenames = []
    for img_id in ids_train:
        x_train_filenames.append(os.path.join(img_dir, "{}.jpg".format(img_id)))
        y_train_filenames.append(os.path.join(label_dir, "{}_mask.gif".format(img_id)))

    x_train_filenames, x_val_filenames, y_train_filenames, y_val_filenames = \
        train_test_split(x_train_filenames, y_train_filenames, test_size=0.2, random_state=42)

    def load_and_process_image(path):
        array = mpimg.imread(path)
        result = np.array(Image.fromarray(array).resize(size=(128, 128)))
        result = result.astype(float)
        result /= 255.0
        return result

    def load_and_process_image_label(path):
        array = mpimg.imread(path)
        result = np.array(Image.fromarray(array).resize(size=(128, 128)))
        result = np.expand_dims(result[:, :, 1], axis=-1)
        result = result.astype(float)
        result /= 255.0
        return result

    train_images = np.stack([load_and_process_image(filepath) for filepath in x_train_filenames])
    train_label_images = np.stack([load_and_process_image_label(filepath)
                                   for filepath in y_train_filenames])
    val_images = np.stack([load_and_process_image(filepath) for filepath in x_val_filenames])
    val_label_images = np.stack([load_and_process_image_label(filepath)
                                 for filepath in y_val_filenames])
    train_shards = XShards.partition({"x": train_images, "y": train_label_images})
    val_shards = XShards.partition({"x": val_images, "y": val_label_images})

    # Build the U-Net model
from zoo.examples.orca.learn.tf.image_segmentation.carvana_datasets import Carvana
import tensorflow_datasets as tfds

def preprocessing(data):
    image = data['image']
    mask = data['mask']
    image = tf.image.resize(image, size=[128, 128]) / 255.0
    # The mask is stored as a GIF, which decodes to a 4-D (frames, H, W, 3)
    # tensor; take the first (and only) frame before resizing.
    mask = tf.image.rgb_to_grayscale(tf.image.resize(mask[0], size=[128, 128])) / 255.0
    return image, mask

# Define custom metrics
def dice_coeff(y_true, y_pred):
    smooth = 1.
    # Flatten
    y_true_f = tf.reshape(y_true, [-1])
    y_pred_f = tf.reshape(y_pred, [-1])
    intersection = tf.reduce_sum(y_true_f * y_pred_f)
    score = (2. * intersection + smooth) / \
        (tf.reduce_sum(y_true_f) + tf.reduce_sum(y_pred_f) + smooth)
    return score

# Define custom loss function
def dice_loss(y_true, y_pred):
    loss = 1 - dice_coeff(y_true, y_pred)
    return loss

def bce_dice_loss(y_true, y_pred):
    loss = losses.binary_crossentropy(y_true, y_pred) + dice_loss(y_true, y_pred)
    return loss

def create_unet_model():
    # Build the U-Net model
    def conv_block(input_tensor, num_filters):
        encoder = layers.Conv2D(num_filters, (3, 3), padding='same')(input_tensor)
        encoder = layers.Activation('relu')(encoder)
@@ -132,67 +101,65 @@ def decoder_block(input_tensor, concat_tensor, num_filters):
    outputs = layers.Conv2D(1, (1, 1), activation='sigmoid')(decoder0)

    net = models.Model(inputs=[inputs], outputs=[outputs])

    # Define custom metrics
    def dice_coeff(y_true, y_pred):
        smooth = 1.
        # Flatten
        y_true_f = tf.reshape(y_true, [-1])
        y_pred_f = tf.reshape(y_pred, [-1])
        intersection = tf.reduce_sum(y_true_f * y_pred_f)
        score = (2. * intersection + smooth) / \
            (tf.reduce_sum(y_true_f) + tf.reduce_sum(y_pred_f) + smooth)
        return score

    # Define custom loss function
    def dice_loss(y_true, y_pred):
        loss = 1 - dice_coeff(y_true, y_pred)
        return loss

    def bce_dice_loss(y_true, y_pred):
        loss = losses.binary_crossentropy(y_true, y_pred) + dice_loss(y_true, y_pred)
        return loss

    # compile model
    net.compile(optimizer=tf.keras.optimizers.Adam(2e-3), loss=bce_dice_loss)
    print(net.summary())
    return net



def main(cluster_mode, max_epoch, file_path, batch_size):
    if cluster_mode == "local":
        init_orca_context(cluster_mode="local", cores=4, memory="3g")
        data_dir = "~/tensorflow_datasets"
    elif cluster_mode == "yarn":
        init_orca_context(cluster_mode="yarn-client", num_nodes=2, cores=2, driver_memory="3g")
        data_dir = "hdfs:///tensorflow_datasets"

    dataset_builder = Carvana(data_dir=data_dir)
    dataset_builder.download_and_prepare()
    # Train on the first 80% of the train split, hold out the last 20% for testing
    train_dataset = dataset_builder.as_dataset(split="train[:80%]")
    test_dataset = dataset_builder.as_dataset(split="train[-20%:]")

    train_dataset = train_dataset.map(preprocessing)
    test_dataset = test_dataset.map(preprocessing)

    # create an estimator from keras model
    est = Estimator.from_keras(keras_model=net)
    est = Estimator.from_keras(keras_model=create_unet_model())
    # fit with estimator
    est.fit(data=train_shards,
    est.fit(data=train_dataset,
            batch_size=batch_size,
            epochs=max_epoch)
    # evaluate with estimator
    result = est.evaluate(val_shards)
    result = est.evaluate(test_dataset)
    print(result)
    # predict with estimator
    val_shards.cache()
    val_image_shards = val_shards.transform_shard(lambda val_dict: {"x": val_dict["x"]})
    pred_shards = est.predict(data=val_image_shards, batch_size=batch_size)
    pred = pred_shards.collect()[0]["prediction"]
    val_image_label = val_shards.collect()[0]
    val_image = val_image_label["x"]
    val_label = val_image_label["y"]
    # visualize 5 predicted results
    plt.figure(figsize=(10, 20))
    for i in range(5):
        img = val_image[i]
        label = val_label[i]
        predicted_label = pred[i]

        plt.subplot(5, 3, 3 * i + 1)
        plt.imshow(img)
        plt.title("Input image")

        plt.subplot(5, 3, 3 * i + 2)
        plt.imshow(label[:, :, 0], cmap='gray')
        plt.title("Actual Mask")
        plt.subplot(5, 3, 3 * i + 3)
        plt.imshow(predicted_label, cmap='gray')
        plt.title("Predicted Mask")
    plt.suptitle("Examples of Input Image, Label, and Prediction")
    plt.show()
    # # predict with estimator
    # val_shards.cache()
    # val_image_shards = val_shards.transform_shard(lambda val_dict: {"x": val_dict["x"]})
    # pred_shards = est.predict(data=val_image_shards, batch_size=batch_size)
    # pred = pred_shards.collect()[0]["prediction"]
    # val_image_label = val_shards.collect()[0]
    # val_image = val_image_label["x"]
    # val_label = val_image_label["y"]
    # # visualize 5 predicted results
    # plt.figure(figsize=(10, 20))
    # for i in range(5):
    #     img = val_image[i]
    #     label = val_label[i]
    #     predicted_label = pred[i]

    #     plt.subplot(5, 3, 3 * i + 1)
    #     plt.imshow(img)
    #     plt.title("Input image")

    #     plt.subplot(5, 3, 3 * i + 2)
    #     plt.imshow(label[:, :, 0], cmap='gray')
    #     plt.title("Actual Mask")
    #     plt.subplot(5, 3, 3 * i + 3)
    #     plt.imshow(predicted_label, cmap='gray')
    #     plt.title("Predicted Mask")
    # plt.suptitle("Examples of Input Image, Label, and Prediction")
    # plt.show()

    stop_orca_context()
