CIS565-Fall-2021 · Scoutydren · Jan 16, 2021 · Sep 10, 2021 · Sep 10, 2021 · Sep 18, 2021
diff --git a/.gitignore b/.gitignore
@@ -23,7 +23,8 @@ build
 .LSOverride
 
 # Icon must end with two \r
-Icon
+Icon
+
 
 # Thumbnails
 ._*
@@ -558,3 +559,4 @@ xcuserdata
 *.xccheckout
 *.moved-aside
 *.xcuserstate
+.vscode/settings.json
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -86,15 +86,33 @@ set(sources
     src/utilities.cpp
     )
 
+set(imgui
+    imgui/imconfig.h
+    imgui/imgui.cpp
+    imgui/imgui.h
+    imgui/imgui_draw.cpp
+    imgui/imgui_internal.h
+    imgui/imgui_widgets.cpp
+    imgui/imgui_demo.cpp
+    imgui/imgui_impl_glfw.cpp
+    imgui/imgui_impl_glfw.h
+    imgui/imgui_impl_opengl2.cpp
+    imgui/imgui_impl_opengl2.h
+    imgui/imgui_impl_opengl3.cpp
+    imgui/imgui_impl_opengl3.h
+    )
+
 list(SORT headers)
 list(SORT sources)
+list(SORT imgui)
 
 source_group(Headers FILES ${headers})
 source_group(Sources FILES ${sources})
+source_group(imgui FILES ${imgui})
 
 #add_subdirectory(stream_compaction)  # TODO: uncomment if using your stream compaction
 
-cuda_add_executable(${CMAKE_PROJECT_NAME} ${sources} ${headers})
+cuda_add_executable(${CMAKE_PROJECT_NAME} ${sources} ${headers} ${imgui})
 target_link_libraries(${CMAKE_PROJECT_NAME}
     ${LIBRARIES}
     #stream_compaction  # TODO: uncomment if using your stream compaction

diff --git a/INSTRUCTION.md b/INSTRUCTION.md
diff --git a/README.md b/README.md
@@ -1,13 +1,195 @@
+
+
+# CUDA Denoiser 
+
+**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 4**
+
+* **Name:**  Beini Gu
+  * [LinkedIn](https://www.linkedin.com/in/rgu/), [personal website](https://www.seas.upenn.edu/~gubeini/), [twitter](https://twitter.com/scoutydren), etc.
+* Tested on: Windows 10, AMD Ryzen 7 3700X 8-Core Processor 3.59 GHz 16GB, GTX 2060
+
+## Project Description
+
+This is a CUDA-based denoiser for path tracing. The denoising algorithm in use is the Edge-Avoiding A-Trous Wavelet Transform. This denoiser reduces the number of iteration steps needed by the Monte Carlo sampling path tracer. I integrated the denoiser into the previous CUDA path tracer project, with added GUI sliders and parameters built with the Dear ImGui library.
+
+## Denoising Effects
+
+* Cornell ceiling light demo
+
+| Denoising On       | Denoising Off       |
+| ------------------ | ------------------- |
+| ![on](img/on.png)  | ![off](img/off.png) |
+
+* G buffers (custom scene demo)
+
+| Normal Buffer                | Position Buffer            |
+| ---------------------------- | -------------------------- |
+| ![norm](img/norm_buffer.png) | ![pos](img/pos_buffer.png) |
+
+## Performance Analysis
+
+### Time taken w/w.o. denoising
+
+![comp](img/time_compare.png)
+
+* The conclusion is that denoising does not add significant overhead to the rendering: it adds 15% of the original time, so the performance difference between rendering with and without denoising is almost negligible.
+
+### Number of iterations taken for image to be "smooth"
+
+| With Denoising                                      | Without Denoising                                    |
+| --------------------------------------------------- | ---------------------------------------------------- |
+| ![image-20211025011136397](img/35_iters_needed.png) | ![image-20211025011136397](img/500_iters_needed.png) |
+
+* The left image: With added denoising, the image looks smooth when it only has 35 iterations. However, on the right image, without added denoising, the image needs 500 iterations to look "smooth" enough.
+
+### Time taken for denoising with increasing resolution
+
+![img](img/iteration_compare.png)
+
+* The time taken for denoised rendering increases linearly as resolution increases. 
+
+### Time taken for denoising with increasing filter size
+
+```c++
+    int atrou_iter = glm::floor(log2((filtersize - 5) / 4.f)) + 1;
+```
+
+* The number of iterations that the A-trous denoising kernel runs is calculated as above from filterSize. The illustration below shows why it is calculated this way, by taking the integer log2 of (filtersize - 5) / 4 plus 1.
+
+  ![illus](img/illus.png)
+
+* The time increases with increasing filter size, but not linearly. This is because the time grows with the number of iterations, not with the filter size itself, and an increase in filter size results in only a log(n) increase in the number of iterations. This is shown in the graph below.
+
+![img](img/filtersize.png)
+
+### Image Quality with increasing filter size
+
+* For an image resolution of 800 * 800
+
+| FilterSize = 20          | FilterSize = 100          | FilterSize = 200           |
+| ------------------------ | ------------------------- | -------------------------- |
+| ![20](img/filter_20.png) | ![100](img/filter_100.png) | ![200](img/filter_200.png) |
+
+* **Small filter size:** results in Worley-looking noise in the image
+* **Medium filter size:** works best
+* **Big filter size:** smooths too much. In the rightmost image, where filtersize = 200, the cast shadow is much smaller than in the middle image because it gets smoothed out.
+
+### Material Type comparison
+
+| Perfectly Specular       | Refractive material (glass) |
+| ------------------------ | --------------------------- |
+| ![img](img/specular.png) | ![img](img/refract.png)     |
+
+* The left image shows a perfectly specular sphere after denoising. This is a good result since it preserves the quality of the material.
+* The right image shows the refractive glass material, but it is not smoothed out nicely. It loses some quality of the refractive material.
+* For perfectly diffuse materials, denoising works best (see the walls).
+
 CUDA Path Tracer
 ================
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 3**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* **Name:**  Beini Gu
+* [LinkedIn](https://www.linkedin.com/in/rgu/), [personal website](https://www.seas.upenn.edu/~gubeini/), [twitter](https://twitter.com/scoutydren), etc.
+* Tested on: Windows 10, AMD Ryzen 7 3700X 8-Core Processor 3.59 GHz 16GB, GTX 2060
+
+## Project Description
+
+This is a CUDA-based path tracer with global illumination able to render scenes utilizing the parallel computing power of GPU. 
+
+![bunny](img/main.png)
+
+## Table of Contents
+
+[Features Overview](#overview-features)   
+[Miscellaneous](#misc)   
+[Performance Analysis](#performance-analysis)   
+[Bloopers](#bloopers)
+
+[Reference](#reference)
+
+<a name="overview-features"/> 
+
+## Features Overview
+
+#### Obj Mesh Loading
+
+I use the [tinyObj](https://github.com/tinyobjloader/tinyobjloader) library for parsing and loading the custom triangulated meshes in obj format. I use the VBO data to get the positions of the triangle vertices, calculate the normals, and calculate their intersections with rays.
+
+To optimize performance and reduce the number of ray-triangle intersection tests, I use a bounding box for the mesh. The ray is first checked against the bounding box volume, in order to discard unnecessary rays that intersect somewhere outside the bounding box.
+
+
+| Bunny                   | Cow                 |
+| ----------------------- | ------------------- |
+| ![bunny](img/bunny.png) | ![cow](img/cow.png) |
+
+#### Refraction
+
+I implement refractive materials using Schlick's approximation and Snell's Law. This is how it looks
+
+* Glass caustics on the floor (IOR = 1.52)
+
+![no_dof](img/refraction.png)
+
+#### Depth of Field
+
+I implement a togglable depth of field effect which simulates a realistic camera with focal distance and lens radius. 
+
+| Without Depth of Field    | With Depth of Field (Focal distance = 10.0) | With Depth of Field (Focal Distance = 5.0) |
+| ------------------------- | ------------------------------------------- | ------------------------------------------ |
+| ![no_dof](img/no_dof.png) | ![dof_10](img/dof_10.png)                   | ![dof_10](img/dof_5.png)                   |
+
+#### Anti-aliasing
+
+| Without Anti-Aliasing                    | With Anti-Aliasing                         |
+| ---------------------------------------- | ------------------------------------------ |
+| ![no_dof](img/no_anti_aliasing_demo.png) | ![no_dof](img/with_anti_aliasing_demo.png) |
+
+#### Stratified and Halton Sampling
+
+I implemented two other hemisphere sampling methods: stratified and Halton sampling. Under larger numbers of iterations, there won't be a lot of difference. 
+
+| Random Sampling (iters = 20) | Stratified Sampling (iters = 20) | Halton Sampling (iters = 20) |
+| ---------------------------- | -------------------------------- | ---------------------------- |
+| ![no_dof](img/random.png)    | ![no_dof](img/stratified.png)    | ![no_dof](img/halton.png)    |
+
+Halton sampling is a quasi-random number generation method; it leaves a pattern on the render when the sample number is small. (The above image of Halton Sampling uses 1000 as the sample number, while stratified sampling uses 100 as the sample number.)
+
+| Halton (Sequence length = 100) | Halton (Sequence length = 500) | Halton (Sequence length = 1000) |
+| ------------------------------ | ------------------------------ | ------------------------------- |
+| ![no_dof](img/halton_100.png)  | ![no_dof](img/halton_500.png)  | ![no_dof](img/halton_1000.png)  |
+
+<a name="misc"/> 
+
+## Miscellaneous
+
+I implemented material sort using thrust to improve the performance of path tracer. In addition, there is a togglable option for caching the first intersections (does not work together with anti-aliasing). 
+
+<a name="performance-analysis"></a>
+## Performance Analysis
+
+#### Mesh Culling
+
+Applying mesh culling gives a performance advantage compared to no bounding volume checks. 
+
+The following performance was measured while rendering the main profile scene. Mesh culling gives a 1.6 times performance increase.
+
+![boundcheck1](img/boundcheck_comp.png)
+
+The performance was also measured for the mesh-only scene of the bunny render. We can see that the advantage is less significant there. This is because, in the previous render, the bunny takes up a smaller part of the whole image, so using a bounding box to bound that smaller region gives much more advantage.
+
+![Snipaste_2021-10-06_13-04-02](img/boundcheck2.png)
+
+<a name="bloopers"/> 
+
+## Bloopers
+
+| Normal Error & Distance calculation error                    | Refraction calculation error                                 | Randomness error                                             |
+| ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
+| ![Snipaste_2021-10-06_13-04-02](img/Snipaste_2021-10-06_13-04-02.png) | ![Snipaste_2021-10-03_18-55-19](img/Snipaste_2021-10-03_18-55-19.png) | ![Snipaste_2021-10-08_21-11-44](img/Snipaste_2021-10-08_21-11-44.png) |
 
-### (TODO: Your README)
+## Reference
 
-*DO NOT* leave the README to the last minute! It is a crucial part of the
-project, and we will not be able to grade you without a good README.
+* PBRT book [Physically Based Rendering: From Theory to Practice](https://www.pbr-book.org/3ed-2018/contents)
+* Wikipedia: [Halton sequence](https://en.wikipedia.org/wiki/Halton_sequence)
 
diff --git a/cmake/CUDAComputesList.cmake b/cmake/CUDAComputesList.cmake
@@ -60,6 +60,8 @@ IF(    CUDA_COMPUTE_20
     OR CUDA_COMPUTE_70
     OR CUDA_COMPUTE_72
     OR CUDA_COMPUTE_75
+    OR CUDA_COMPUTE_80
+    OR CUDA_COMPUTE_86
     )
     SET(FALLBACK OFF)
 ELSE()
@@ -70,8 +72,8 @@ LIST(LENGTH COMPUTES_DETECTED_LIST COMPUTES_LEN)
 IF(${COMPUTES_LEN} EQUAL 0 AND ${FALLBACK})
     MESSAGE(STATUS "You can use -DCOMPUTES_DETECTED_LIST=\"AB;XY\" (semicolon separated list of CUDA Compute versions to enable the specified computes")
     MESSAGE(STATUS "Individual compute versions flags are also available under CMake Advance options")
-    LIST(APPEND COMPUTES_DETECTED_LIST "30" "50" "60" "70")
-    MESSAGE(STATUS "No computes detected. Fall back to 30, 50, 60 70")
+    LIST(APPEND COMPUTES_DETECTED_LIST "30" "50" "60" "70" "80")
+    MESSAGE(STATUS "No computes detected. Fall back to 30, 50, 60, 70, 80")
 ENDIF()
 
 LIST(LENGTH COMPUTES_DETECTED_LIST COMPUTES_LEN)
@@ -90,7 +92,7 @@ MACRO(SET_COMPUTE VERSION)
 ENDMACRO(SET_COMPUTE)
 
 # Iterate over compute versions. Create variables and enable computes if needed
-FOREACH(VER 20 30 32 35 37 50 52 53 60 61 62 70 72 75)
+FOREACH(VER 20 30 32 35 37 50 52 53 60 61 62 70 72 75 80 86)
     OPTION(CUDA_COMPUTE_${VER} "CUDA Compute Capability ${VER}" OFF)
     MARK_AS_ADVANCED(CUDA_COMPUTE_${VER})
     IF(${CUDA_COMPUTE_${VER}})

diff --git a/cmake/FindGLFW.cmake b/cmake/FindGLFW.cmake
@@ -20,66 +20,66 @@
 include(FindPackageHandleStandardArgs)
 
 if (WIN32)
-	# Find include files
-	find_path(
-		GLFW_INCLUDE_DIR
-		NAMES GLFW/glfw3.h
-		PATHS
-		$ENV{PROGRAMFILES}/include
-		${GLFW_ROOT_DIR}/include
-		DOC "The directory where GLFW/glfw.h resides")
+  # Find include files
+  find_path(
+    GLFW_INCLUDE_DIR
+    NAMES GLFW/glfw3.h
+    PATHS
+    $ENV{PROGRAMFILES}/include
+    ${GLFW_ROOT_DIR}/include
+    DOC "The directory where GLFW/glfw.h resides")
 
-	# Use glfw3.lib for static library
-	if (GLFW_USE_STATIC_LIBS)
-		set(GLFW_LIBRARY_NAME glfw3)
-	else()
-		set(GLFW_LIBRARY_NAME glfw3dll)
-	endif()
+  # Use glfw3.lib for static library
+  if (GLFW_USE_STATIC_LIBS)
+    set(GLFW_LIBRARY_NAME glfw3)
+  else()
+    set(GLFW_LIBRARY_NAME glfw3dll)
+  endif()
 
-	# Find library files
-	find_library(
-		GLFW_LIBRARY
-		NAMES ${GLFW_LIBRARY_NAME}
-		PATHS
-		$ENV{PROGRAMFILES}/lib
-		${GLFW_ROOT_DIR}/lib)
+  # Find library files
+  find_library(
+    GLFW_LIBRARY
+    NAMES ${GLFW_LIBRARY_NAME}
+    PATHS
+    $ENV{PROGRAMFILES}/lib
+    ${GLFW_ROOT_DIR}/lib)
 
-	unset(GLFW_LIBRARY_NAME)
+  unset(GLFW_LIBRARY_NAME)
 else()
-	# Find include files
-	find_path(
-		GLFW_INCLUDE_DIR
-		NAMES GLFW/glfw.h
-		PATHS
-		/usr/include
-		/usr/local/include
-		/sw/include
-		/opt/local/include
-		DOC "The directory where GL/glfw.h resides")
+  # Find include files
+  find_path(
+    GLFW_INCLUDE_DIR
+    NAMES GLFW/glfw.h
+    PATHS
+    /usr/include
+    /usr/local/include
+    /sw/include
+    /opt/local/include
+    DOC "The directory where GL/glfw.h resides")
 
-	# Find library files
-	# Try to use static libraries
-	find_library(
-		GLFW_LIBRARY
-		NAMES glfw3
-		PATHS
-		/usr/lib64
-		/usr/lib
-		/usr/local/lib64
-		/usr/local/lib
-		/sw/lib
-		/opt/local/lib
-		${GLFW_ROOT_DIR}/lib
-		DOC "The GLFW library")
+  # Find library files
+  # Try to use static libraries
+  find_library(
+    GLFW_LIBRARY
+    NAMES glfw3
+    PATHS
+    /usr/lib64
+    /usr/lib
+    /usr/local/lib64
+    /usr/local/lib
+    /sw/lib
+    /opt/local/lib
+    ${GLFW_ROOT_DIR}/lib
+    DOC "The GLFW library")
 endif()
 
 # Handle REQUIRD argument, define *_FOUND variable
 find_package_handle_standard_args(GLFW DEFAULT_MSG GLFW_INCLUDE_DIR GLFW_LIBRARY)
 
 # Define GLFW_LIBRARIES and GLFW_INCLUDE_DIRS
 if (GLFW_FOUND)
-	set(GLFW_LIBRARIES ${OPENGL_LIBRARIES} ${GLFW_LIBRARY})
-	set(GLFW_INCLUDE_DIRS ${GLFW_INCLUDE_DIR})
+  set(GLFW_LIBRARIES ${OPENGL_LIBRARIES} ${GLFW_LIBRARY})
+  set(GLFW_INCLUDE_DIRS ${GLFW_INCLUDE_DIR})
 endif()
 
 # Hide some variables