Skip to content

Commit

Permalink
Merge pull request #41 from LaTeleScop/develop
Browse files Browse the repository at this point in the history
Single configurable Dockerfile with multi-stage build + external bazel cache
  • Loading branch information
remicres authored Jan 25, 2021
2 parents 3ecf418 + c2731bc commit 702dfe2
Show file tree
Hide file tree
Showing 22 changed files with 488 additions and 1,446 deletions.
2 changes: 2 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.git
python/__pycache__
167 changes: 167 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
##### Configurable Dockerfile with multi-stage build - Author: Vincent Delbar
## Mandatory
ARG BASE_IMG

# ----------------------------------------------------------------------------
# Init base stage - will be cloned as intermediate build env
FROM $BASE_IMG AS otbtf-base
WORKDIR /tmp

### System packages
# Package lists are kept in external files so they can be edited without touching this Dockerfile
COPY tools/docker/build-deps-*.txt ./
# ARG (not ENV) so the non-interactive frontend does not leak into the runtime environment
ARG DEBIAN_FRONTEND=noninteractive
# CLI
# NOTE(review): 'apt-get upgrade' is flagged by hadolint DL3005 - kept since BASE_IMG
# is user-provided and may be stale; 'xargs -a' avoids a useless cat (SC2002)
RUN apt-get update -y && apt-get upgrade -y \
&& xargs -a build-deps-cli.txt apt-get install --no-install-recommends -y \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
# Optional GUI
ARG GUI=false
RUN if $GUI; then \
apt-get update -y \
&& xargs -a build-deps-gui.txt apt-get install --no-install-recommends -y \
&& apt-get clean && rm -rf /var/lib/apt/lists/* ; fi

### Python3 links and pip packages
# Make python/pip point to the system python3/pip3 (required by TF and OTB build scripts)
RUN ln -s /usr/bin/python3 /usr/local/bin/python && ln -s /usr/bin/pip3 /usr/local/bin/pip
# NumPy version is conflicting with system's gdal dep and may require venv
ARG NUMPY_SPEC="~=1.19"
# keras_applications/keras_preprocessing are TF build-time deps, installed without their own deps
RUN pip install --no-cache-dir -U pip wheel mock six future "numpy$NUMPY_SPEC" \
&& pip install --no-cache-dir --no-deps keras_applications keras_preprocessing

# ----------------------------------------------------------------------------
# Tmp builder stage - dangling cache should persist until "docker builder prune"
FROM otbtf-base AS builder
# A smaller value may be required to avoid OOM errors when building OTB GUI
ARG CPU_RATIO=1

# /src/tf holds the TF sources; /opt/otbtf is the shared install prefix (also used by OTB below)
RUN mkdir -p /src/tf /opt/otbtf/bin /opt/otbtf/include /opt/otbtf/lib
WORKDIR /src/tf

# Silence the notice printed by git when checking out a tag (detached HEAD)
RUN git config --global advice.detachedHead false

### TF
# Git branch or tag of the tensorflow repository to clone
ARG TF=v2.4.1
# Install bazelisk (will read .bazelversion and download the right bazel binary - latest by default)
RUN wget -qO /opt/otbtf/bin/bazelisk https://github.com/bazelbuild/bazelisk/releases/latest/download/bazelisk-linux-amd64 \
&& chmod +x /opt/otbtf/bin/bazelisk \
&& ln -s /opt/otbtf/bin/bazelisk /opt/otbtf/bin/bazel

ARG BZL_TARGETS="//tensorflow:libtensorflow_cc.so //tensorflow/tools/pip_package:build_pip_package"
# "--config=opt" will enable 'march=native' (otherwise edit CC_OPT_FLAGS in build-env-tf.sh)
ARG BZL_CONFIGS="--config=nogcp --config=noaws --config=nohdfs --config=opt"
# "--compilation_mode opt" is already enabled by default (see tf repo .bazelrc and configure.py)
# NOTE(review): the remote cache defaults to localhost:9090 - requires a bazel-remote daemon
# and '--network=host' at build time, otherwise bazel silently falls back to a cold build
ARG BZL_OPTIONS="--verbose_failures --remote_cache=http://localhost:9090"

# Build
ARG KEEP_SRC_TF=false
COPY tools/docker/build-env-tf.sh ./
# Single layer: clone, configure, bazel build, install wheel + C++ lib, then clean -
# splitting it would persist the huge bazel output tree in intermediate layers
RUN git clone --single-branch -b $TF https://github.com/tensorflow/tensorflow.git \
&& cd tensorflow \
&& export PATH=$PATH:/opt/otbtf/bin \
&& export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/otbtf/lib \
&& bash -c '\
source ../build-env-tf.sh \
&& ./configure \
&& export TMP=/tmp/bazel \
&& BZL_CMD="build $BZL_TARGETS $BZL_CONFIGS $BZL_OPTIONS" \
&& bazel $BZL_CMD --jobs="HOST_CPUS*$CPU_RATIO" ' \
# Installation - split here if you want to check files ^
#RUN cd tensorflow \
&& ./bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg \
&& pip3 install --no-cache-dir --prefix=/opt/otbtf /tmp/tensorflow_pkg/tensorflow*.whl \
# Version-agnostic symlink so later paths don't hardcode the python minor version
&& ln -s /opt/otbtf/lib/python3.* /opt/otbtf/lib/python3 \
# -P preserves the .so symlink chain (libtensorflow_cc.so -> .so.2 -> .so.2.x.y)
&& cp -P bazel-bin/tensorflow/libtensorflow_cc.so* /opt/otbtf/lib/ \
&& ln -s $(find /opt/otbtf -type d -wholename "*/site-packages/tensorflow/include") /opt/otbtf/include/tf \
# The only missing header in the wheel
&& cp tensorflow/cc/saved_model/tag_constants.h /opt/otbtf/include/tf/tensorflow/cc/saved_model/ \
# Symlink external libs (required for MKL - libiomp5)
&& for f in $(find -L /opt/otbtf/include/tf -wholename "*/external/*/*.so"); do ln -s $f /opt/otbtf/lib/; done \
# Cleaning
&& rm -rf bazel-* \
&& ( $KEEP_SRC_TF || rm -rf /src/tf ) \
&& rm -rf /root/.cache/ /tmp/*

### OTB
# ARG values don't cross instruction groups within a stage scope boundary - GUI is
# redeclared here so the flag is available again after the TF section's ARGs
ARG GUI=false
ARG OTB=7.2.0

RUN mkdir /src/otb
WORKDIR /src/otb

# SuperBuild OTB
# cmake flags live in an external file so they can be tuned without editing this Dockerfile
COPY tools/docker/build-flags-otb.txt ./
RUN git clone --single-branch -b $OTB https://gitlab.orfeo-toolbox.org/orfeotoolbox/otb.git \
&& mkdir -p build \
&& cd build \
# Set GL/Qt build flags
&& if $GUI; then \
sed -i -r "s/-DOTB_USE_(QT|OPENGL|GL[UFE][WT])=OFF/-DOTB_USE_\1=ON/" ../build-flags-otb.txt; fi \
# Possible ENH: superbuild-all-dependencies switch (with separated build-deps-minimal.txt and build-deps-otbcli.txt)
#&& if $OTB_SUPERBUILD_ALL; then sed -i -r "s/-DOTB_USE_SYSTEM_([A-Z0-9]*)=ON/-DOTB_USE_SYSTEM_\1=OFF/" ../build-flags-otb.txt; fi \
&& OTB_FLAGS=$(cat "../build-flags-otb.txt") \
&& cmake ../otb/SuperBuild -DCMAKE_INSTALL_PREFIX=/opt/otbtf $OTB_FLAGS \
# Job count scaled by CPU_RATIO to avoid OOM on large parallel SuperBuild steps
&& make -j $(python -c "import os; print(round( os.cpu_count() * $CPU_RATIO ))")

### OTBTF - copy (without .git/) or clone repository
COPY . /src/otbtf
#RUN git clone https://github.com/remicres/otbtf.git /src/otbtf
# Register OTBTF as an OTB remote module so the OTB build picks it up
RUN ln -s /src/otbtf /src/otb/otb/Modules/Remote/otbtf

# Rebuild OTB with module
ARG KEEP_SRC_OTB=false
RUN cd /src/otb/build/OTB/build \
&& export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/otbtf/lib \
&& export PATH=$PATH:/opt/otbtf/bin \
&& cmake /src/otb/otb \
-DCMAKE_INSTALL_PREFIX=/opt/otbtf \
-DOTB_WRAP_PYTHON=ON -DPYTHON_EXECUTABLE=/usr/bin/python3 \
-DOTB_USE_TENSORFLOW=ON -DModule_OTBTensorflow=ON \
-Dtensorflow_include_dir=/opt/otbtf/include/tf \
# Forcing TF>=2, this Dockerfile hasn't been tested with v1 + missing link for libtensorflow_framework.so in the wheel
-DTENSORFLOW_CC_LIB=/opt/otbtf/lib/libtensorflow_cc.so.2 \
-DTENSORFLOW_FRAMEWORK_LIB=/opt/otbtf/lib/python3/site-packages/tensorflow/libtensorflow_framework.so.2 \
&& make install -j $(python -c "import os; print(round( os.cpu_count() * $CPU_RATIO ))") \
# Cleaning
&& ( $GUI || rm -rf /opt/otbtf/bin/otbgui* ) \
&& ( $KEEP_SRC_OTB || rm -rf /src/otb ) \
&& rm -rf /root/.cache /tmp/*

# Symlink executable python files in PATH
# Quoted "$f" (ShellCheck SC2086) so paths with spaces can't break the loop
RUN for f in /src/otbtf/python/*.py; do if [ -x "$f" ]; then ln -s "$f" /opt/otbtf/bin/; fi; done

# ----------------------------------------------------------------------------
# Final stage
FROM otbtf-base
# MAINTAINER is deprecated (hadolint DL4000) - use a LABEL instead
LABEL maintainer="Remi Cresson <remi.cresson[at]inrae[dot]fr>"

# Copy files from intermediate stage
COPY --from=builder /opt/otbtf /opt/otbtf
COPY --from=builder /src /src

# System-wide ENV
ENV PATH="/opt/otbtf/bin:$PATH"
ENV LD_LIBRARY_PATH="/opt/otbtf/lib:$LD_LIBRARY_PATH"
ENV PYTHONPATH="/opt/otbtf/lib/python3/site-packages:/opt/otbtf/lib/otb/python:/src/otbtf/python"
ENV OTB_APPLICATION_PATH="/opt/otbtf/lib/otb/applications"

# Default user, directory and command (bash is the entrypoint when using 'docker create')
RUN useradd -s /bin/bash -m otbuser
WORKDIR /home/otbuser

# Admin rights without password
ARG SUDO=true
RUN if $SUDO; then \
usermod -a -G sudo otbuser \
&& echo "otbuser ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers; fi

# Set /src/otbtf ownership to otbuser (but you still need 'sudo -i' in order to rebuild TF or OTB)
RUN chown -R otbuser:otbuser /src/otbtf

# This won't prevent ownership problems with volumes if you're not UID 1000
USER otbuser
# User-only ENV

# Test python imports - fail the build early if TF, OTBTF helpers or the OTB bindings are broken
RUN python -c "import tensorflow"
RUN python -c "import otbtf, tricks"
RUN python -c "import otbApplication as otb; otb.Registry.CreateApplication('ImageClassifierFromDeepFeatures')"
10 changes: 5 additions & 5 deletions include/otbTensorflowGraphOperations.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@ namespace tf {
//
// Restore a model from a path
//
void RestoreModel(const std::string path, tensorflow::SavedModelBundle & bundle)
void RestoreModel(const tensorflow::tstring path, tensorflow::SavedModelBundle & bundle)
{
tensorflow::Tensor checkpointPathTensor(tensorflow::DT_STRING, tensorflow::TensorShape());
checkpointPathTensor.scalar<std::string>()() = path;
checkpointPathTensor.scalar<tensorflow::tstring>()() = path;
std::vector<std::pair<std::string, tensorflow::Tensor>> feed_dict =
{{bundle.meta_graph_def.saver_def().filename_tensor_name(), checkpointPathTensor}};
auto status = bundle.session->Run(feed_dict, {}, {bundle.meta_graph_def.saver_def().restore_op_name()}, nullptr);
Expand All @@ -32,10 +32,10 @@ void RestoreModel(const std::string path, tensorflow::SavedModelBundle & bundle)
//
// Restore a model from a path
//
void SaveModel(const std::string path, tensorflow::SavedModelBundle & bundle)
void SaveModel(const tensorflow::tstring path, tensorflow::SavedModelBundle & bundle)
{
tensorflow::Tensor checkpointPathTensor(tensorflow::DT_STRING, tensorflow::TensorShape());
checkpointPathTensor.scalar<std::string>()() = path;
checkpointPathTensor.scalar<tensorflow::tstring>()() = path;
std::vector<std::pair<std::string, tensorflow::Tensor>> feed_dict =
{{bundle.meta_graph_def.saver_def().filename_tensor_name(), checkpointPathTensor}};
auto status = bundle.session->Run(feed_dict, {}, {bundle.meta_graph_def.saver_def().save_tensor_name()}, nullptr);
Expand All @@ -48,7 +48,7 @@ void SaveModel(const std::string path, tensorflow::SavedModelBundle & bundle)
//
// Load a session and a graph from a folder
//
void LoadModel(const std::string path, tensorflow::SavedModelBundle & bundle)
void LoadModel(const tensorflow::tstring path, tensorflow::SavedModelBundle & bundle)
{

tensorflow::RunOptions runoptions;
Expand Down
6 changes: 3 additions & 3 deletions include/otbTensorflowGraphOperations.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,13 @@ namespace otb {
namespace tf {

// Restore a model from a path
void RestoreModel(const std::string path, tensorflow::SavedModelBundle & bundle);
void RestoreModel(const tensorflow::tstring path, tensorflow::SavedModelBundle & bundle);

// Restore a model from a path
void SaveModel(const std::string path, tensorflow::SavedModelBundle & bundle);
void SaveModel(const tensorflow::tstring path, tensorflow::SavedModelBundle & bundle);

// Load a session and a graph from a folder
void LoadModel(const std::string path, tensorflow::SavedModelBundle & bundle);
void LoadModel(const tensorflow::tstring path, tensorflow::SavedModelBundle & bundle);

// Load a graph from a .meta file
tensorflow::GraphDef LoadGraph(std::string filename);
Expand Down
1 change: 1 addition & 0 deletions python/ckpt2savedmodel.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ==========================================================================
#
Expand Down
1 change: 1 addition & 0 deletions python/create_savedmodel_ienco-m3_patchbased.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ==========================================================================
#
Expand Down
1 change: 1 addition & 0 deletions python/create_savedmodel_maggiori17_fullyconv.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#==========================================================================
#
Expand Down
1 change: 1 addition & 0 deletions python/create_savedmodel_pxs_fcn.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ==========================================================================
#
Expand Down
1 change: 1 addition & 0 deletions python/create_savedmodel_simple_cnn.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ==========================================================================
#
Expand Down
1 change: 1 addition & 0 deletions python/create_savedmodel_simple_fcn.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# ==========================================================================
#
Expand Down
124 changes: 124 additions & 0 deletions tools/docker/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# Build with Docker
Docker build has to be called from the root of the repository (i.e. `docker build .` or `bash tools/docker/multibuild.sh`).
You can build a custom image using `--build-arg` and several config files :
- Ubuntu : `BASE_IMG` should accept any version, for additional packages see [build-deps-cli.txt](build-deps-cli.txt) and [build-deps-gui.txt](build-deps-gui.txt)
- TensorFlow : `TF` arg for the git branch or tag + [build-env-tf.sh](build-env-tf.sh) and BZL_* arguments for the build configuration
- OrfeoToolBox : `OTB` arg for the git branch or tag + [build-flags-otb.txt](build-flags-otb.txt) to edit cmake flags

### Base images
```bash
UBUNTU=20.04 # or 16.04, 18.04
CUDA=11.0.3 # or 10.1, 10.2
CUDNN=8 # or 7
IMG=ubuntu:$UBUNTU
GPU_IMG=nvidia/cuda:$CUDA-cudnn$CUDNN-devel-ubuntu$UBUNTU
```

### Default arguments
```bash
BASE_IMG # mandatory
CPU_RATIO=0.95
GUI=false
NUMPY_SPEC="~=1.19"
TF=v2.4.1
OTB=7.2.0
BZL_TARGETS="//tensorflow:libtensorflow_cc.so //tensorflow/tools/pip_package:build_pip_package"
BZL_CONFIGS="--config=nogcp --config=noaws --config=nohdfs --config=opt"
BZL_OPTIONS="--verbose_failures --remote_cache=http://localhost:9090"
KEEP_SRC_TF=false
KEEP_SRC_OTB=false
SUDO=true

# NumPy version requirement :
# TF < 2.4 : "numpy<1.19.0,>=1.16.0"
# TF >= 2.4 : "numpy~=1.19"
```

### Bazel remote cache daemon
If you just need to rebuild with different GUI or KEEP_SRC arguments, or maybe a different branch of OTB, the bazel cache will help you to rebuild everything except TF, even if the docker cache was purged (after `docker [system|builder] prune`).
In order to recycle the cache, bazel config and TF git tag should be exactly the same, any change in [build-env-tf.sh](build-env-tf.sh) and `--build-arg` (if related to bazel env, cuda, mkl, xla...) may result in a fresh new build.

Start a cache daemon - here with max 20GB but 12GB should be enough to save 2 TF builds (GPU and CPU):
```bash
mkdir -p $HOME/.cache/bazel-remote
docker run --detach -u 1000:1000 -v $HOME/.cache/bazel-remote:/data -p 9090:8080 buchgr/bazel-remote-cache --max_size=20
```
Then just add ` --network='host'` to the docker build command, or connect bazel to a remote server - see 'BZL_OPTIONS'.
Otherwise, since docker's default network mode is a virtual bridge, you'll need to edit the cache server's IP address in `BZL_OPTIONS`.

## Build examples
```bash
# Build for CPU using default Dockerfiles args (without AWS, HDFS or GCP support)
docker build --network='host' -t otbtf:cpu --build-arg BASE_IMG=ubuntu:20.04 .

# Clear bazel config var (deactivate default optimizations and unset noaws/nogcp/nohdfs)
docker build --network='host' -t otbtf:cpu --build-arg BASE_IMG=ubuntu:20.04 --build-arg BZL_CONFIGS= .

# Enable MKL
MKL_CONFIG="--config=nogcp --config=noaws --config=nohdfs --config=opt --config=mkl"
docker build --network='host' -t otbtf:cpu-mkl --build-arg BZL_CONFIGS="$MKL_CONFIG" --build-arg BASE_IMG=ubuntu:20.04 .

# Build for GPU (if you're building for your system only you should edit CUDA_COMPUTE_CAPABILITIES in build-env-tf.sh)
docker build --network='host' -t otbtf:gpu --build-arg BASE_IMG=nvidia/cuda:11.0.3-cudnn8-devel-ubuntu20.04 .

# Build dev with TF and OTB sources (huge image) + set git branches/tags to clone
docker build --network='host' -t otbtf:gpu-dev-full --build-arg BASE_IMG=nvidia/cuda:11.0.3-cudnn8-devel-ubuntu20.04 \
--build-arg KEEP_SRC_OTB=true --build-arg KEEP_SRC_TF=true --build-arg TF=nightly --build-arg OTB=develop .

# Build old release
docker build --network='host' -t otbtf:oldstable-gpu --build-arg BASE_IMG=nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 \
--build-arg TF=r2.1 --build-arg NUMPY_SPEC="<1.19" \
--build-arg BZL_OPTIONS="--noincompatible_do_not_split_linking_cmdline --verbose_failures --remote_cache=http://localhost:9090" .
# You could edit the Dockerfile in order to clone an old branch of the repo instead of copying files from the build context
```

### Debug build
If you fail to build, you can log into the last layer and check CMake logs. Run `docker images`, find the latest layer ID and run a tmp container (`docker run -it d60496d9612e bash`).
You may also need to split some multi-command layers in the Dockerfile.
If you see OOM errors during SuperBuild you should decrease CPU_RATIO (e.g. 0.75).

## Container examples
```bash
# Pull GPU image and create a new container with your home directory as volume (requires apt package nvidia-docker2 and CUDA>=11.0)
docker create --gpus=all --volume $HOME:/home/otbuser/volume -it --name otbtf-gpu mdl4eo/otbtf2.1:gpu

# Run interactive
docker start -i otbtf-gpu

# Run in background
docker start otbtf-gpu
docker exec otbtf-gpu python -c 'import tensorflow as tf; print(tf.test.is_gpu_available())'
```

### Rebuild OTB with more modules
```bash
docker create --gpus=all -it --name otbtf-gpu-dev mdl4eo/otbtf2.1:gpu-dev
docker start -i otbtf-gpu-dev
```
```bash
# From the container shell:
sudo -i
cd /src/otb/otb/Modules/Remote
git clone https://gitlab.irstea.fr/raffaele.gaetano/otbSelectiveHaralickTextures.git
cd /src/otb/build/OTB/build
cmake -DModule_OTBAppSelectiveHaralickTextures=ON /src/otb/otb && make install -j
```

### Container with GUI
```bash
# GUI is disabled by default in order to save space, and because docker xvfb isn't working properly with OpenGL.
# => otbgui seems OK but monteverdi isn't working
docker build --network='host' -t otbtf:cpu-gui --build-arg BASE_IMG=ubuntu:20.04 --build-arg GUI=true .
docker create -v /tmp/.X11-unix:/tmp/.X11-unix -e DISPLAY=$DISPLAY -it --name otbtf-gui otbtf:cpu-gui
docker start -i otbtf-gui
$ mapla
```

### Common errors
Build:
`Error response from daemon: manifest for nvidia/cuda:11.0-cudnn8-devel-ubuntu20.04 not found: manifest unknown: manifest unknown`
=> Image is missing from dockerhub

Run :
`failed call to cuInit: UNKNOWN ERROR (303) / no NVIDIA GPU device is present: /dev/nvidia0 does not exist`
=> Nvidia driver is missing or disabled, make sure to add ` --gpus=all` to your docker run or create command
Loading

0 comments on commit 702dfe2

Please sign in to comment.