# Databricks notebook source
# Example RDD
data = [1, 2, 3, 4, 5]
rdd = sc.parallelize(data)

# COMMAND ----------
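
# MAGIC %md
# MAGIC *Note:* `sc` (the `SparkContext`) is created automatically in Databricks notebooks. The next cell is a minimal sketch, assuming you are running outside Databricks with PySpark installed; on Databricks it is unnecessary and can be skipped.

# COMMAND ----------

# Minimal sketch (assumption: running outside Databricks with pyspark installed).
# On Databricks, `spark` and `sc` already exist and this cell can be skipped.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("rdd-transformations").getOrCreate()
sc = spark.sparkContext

# COMMAND ----------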

# MAGIC %md
# MAGIC # Transformations

# COMMAND ----------

# MAGIC %md
# MAGIC ## `map` Function

# COMMAND ----------

# 1. map
print("### 1. map ###")
print("Description: Return a new RDD by applying a function to all elements of this RDD.")

# Example 1: Multiply each element by 2
simple_map = rdd.map(lambda x: x * 2).collect()
print("01 map example (multiply by 2):", simple_map)

# Example 2: Count the number of words in each sentence
sentences = ["Hello world", "Apache Spark", "RDD transformations Wide Vs Narrow Spark"]
# "Hello world" => split(" ") => ["Hello", "world"] => len => 2
sentence_rdd = sc.parallelize(sentences)
words_map = sentence_rdd.map(lambda sentence: len(sentence.split(" "))).collect()
print("02 map example (words per sentence):", words_map)

# COMMAND ----------

# MAGIC %md
# MAGIC ## `filter` Function

# COMMAND ----------

# 2. filter
print("\n### 2. filter ###")
print("Description: Return a new RDD containing only the elements that satisfy a predicate.")

# Example 1: Keep only the even numbers
simple_filter = rdd.filter(lambda x: x % 2 == 0).collect()
print("01 filter example (even numbers):", simple_filter)

# Example 2: Keep only the sentences containing the word 'Spark'
words_filter = sentence_rdd.filter(lambda sentence: "Spark" in sentence).collect()
print("02 filter example (sentences with 'Spark'):", words_filter)

# COMMAND ----------

# MAGIC %md
# MAGIC ## `flatMap` Function

# COMMAND ----------

# 3. flatMap
print("\n### 3. flatMap ###")
print("Description: Return a new RDD by applying a function to all elements of this RDD and then flattening the results.")

# Example 1: Split sentences into words (map shown first for contrast)
sentences_mapped = sentence_rdd.map(lambda sentence: sentence.split(" ")).collect()
print("01 map for contrast (list of word lists):", sentences_mapped)

simple_flatMap = sentence_rdd.flatMap(lambda sentence: sentence.split(" ")).collect()
print("02 flatMap example (split sentences into words):", simple_flatMap)

# Example 2: Flatten a list of lists
nested_lists = [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
nested_rdd = sc.parallelize(nested_lists)
flatten_list = nested_rdd.flatMap(lambda x: x).collect()
print("03 flatMap example (flatten list of lists):", flatten_list)

# COMMAND ----------

# MAGIC %md
# MAGIC ## `reduce` Function

# COMMAND ----------

# 4. reduce (strictly speaking an action, not a transformation: it returns a value, not an RDD)
print("\n### 4. reduce ###")
print("Description: Reduces the elements of this RDD using the specified commutative and associative binary operator.")

# Example 1: Sum of elements
simple_reduce = rdd.reduce(lambda x, y: x + y)
print("01 reduce example (sum of elements):", simple_reduce)

# Example 2: Find the longest word in a list of words
words = ["cat", "elephant", "rat", "hippopotamus"]
words_rdd = sc.parallelize(words)
words_rdd_reduced = words_rdd.reduce(lambda x, y: x if len(x) > len(y) else y)
print("02 reduce example (longest word):", words_rdd_reduced)

# COMMAND ----------
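
# Why the operator must be associative and commutative: Spark reduces each
# partition independently and then merges the partial results, so a
# non-associative operator such as subtraction gives a partition-dependent
# answer. A minimal sketch (illustrative; the value depends on the split):
non_assoc = sc.parallelize(data, 2).reduce(lambda x, y: x - y)
print("reduce with a non-associative operator (subtraction):", non_assoc)
# With partitions [1, 2] and [3, 4, 5] this evaluates as
# (1 - 2) - ((3 - 4) - 5) = 5, not ((((1 - 2) - 3) - 4) - 5) = -13.

# COMMAND ----------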

# MAGIC %md
# MAGIC ## `groupByKey` Function

# COMMAND ----------

# 5. groupByKey
print("\n### 5. groupByKey ###")
print("Description: Group the values for each key in the RDD into a single sequence.")

# Example 1: Group values by key
pairs = [(1, 'a'), (1, 'ali'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e')]
pairs_rdd = sc.parallelize(pairs)
simple_groupByKey = pairs_rdd.groupByKey().mapValues(list).collect()
print("01 groupByKey example (group values by key):", simple_groupByKey)

# Example 2: Group counts by word
words_pairs = [("cat", 1), ("car", 2), ("dog", 3), ("deer", 4), ("elephant", 5), ("elephant", 20)]
words_rdd = sc.parallelize(words_pairs)
# mapValues(list) converts the grouped values (which are lazy iterables) into lists.
words_grouped = words_rdd.groupByKey().mapValues(list).collect()
print("02 groupByKey example (group counts by word):", words_grouped)
| 120 | + |
# COMMAND ----------

# MAGIC %md
# MAGIC ## `reduceByKey` Function

# COMMAND ----------

# 6. reduceByKey
print("\n### 6. reduceByKey ###")
print("Description: Merge the values for each key using an associative and commutative reduce function.")
pairs = [(1, 'a'), (1, '_a'), (2, 'b'), (2, '_b'), (3, 'c'), (4, 'd'), (5, 'e')]
pairs_rdd = sc.parallelize(pairs)

# Example 1: Combine values with the same key (for strings, + concatenates)
simple_reduceByKey = pairs_rdd.reduceByKey(lambda x, y: x + y).collect()
print("01 reduceByKey example (combine values by key):", simple_reduceByKey)

# Example 2: Count the occurrences of each word in a list
word_list = ["cat", "cat", "dog", "elephant", "dog", "dog"]
word_pairs_rdd = sc.parallelize(word_list).map(lambda word: (word, 1))
word_counts = word_pairs_rdd.reduceByKey(lambda x, y: x + y).collect()
print("02 reduceByKey example (word count):", word_counts)

# COMMAND ----------
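
# The same word count can be written with groupByKey, but reduceByKey is usually
# preferred: it combines values inside each partition before the shuffle, so far
# fewer records cross the network. A minimal comparison sketch:
grouped_counts = word_pairs_rdd.groupByKey().mapValues(sum).collect()
print("word count via groupByKey + mapValues(sum):", grouped_counts)
# Same result as reduceByKey above, but every (word, 1) pair is shuffled first.

# COMMAND ----------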

# MAGIC %md
# MAGIC ## `join` Function

# COMMAND ----------

# 7. join
print("\n### 7. join ###")
print("Description: Perform an inner join of this RDD and another one.")

# Example 1: Join two RDDs by key
fruits = sc.parallelize([(1, "apple"), (2, "banana")])
colors = sc.parallelize([(1, "red"), (2, "yellow")])
fruits_color_join = fruits.join(colors).collect()
print("01 join example (fruits with colors):", fruits_color_join)

# Example 2: Join employee data with department data
employees = sc.parallelize([(1, "John"), (2, "Jane"), (3, "Joe")])
departments = sc.parallelize([(1, "HR"), (2, "Finance")])
employees_department_join = employees.join(departments).collect()
print("02 join example (employee-department join):", employees_department_join)

# COMMAND ----------
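
# join is an inner join, so (3, "Joe") is dropped above: key 3 has no matching
# department. To keep unmatched keys, Spark also provides leftOuterJoin,
# rightOuterJoin, and fullOuterJoin; the missing side comes back as None.
employees_left_join = employees.leftOuterJoin(departments).collect()
print("leftOuterJoin example (keeps Joe with None):", employees_left_join)

# COMMAND ----------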

# MAGIC %md
# MAGIC ## `cogroup` Function

# COMMAND ----------

# MAGIC %md
# MAGIC TableA:
# MAGIC
# MAGIC | id | value  |
# MAGIC |----|--------|
# MAGIC | 1  | apple  |
# MAGIC | 2  | banana |
# MAGIC | 3  | orange |
# MAGIC
# MAGIC TableB:
# MAGIC
# MAGIC | id | color  |
# MAGIC |----|--------|
# MAGIC | 1  | red    |
# MAGIC | 2  | yellow |
# MAGIC
# MAGIC Result of cogroup (each side becomes a list of values; an unmatched key gets an empty list):
# MAGIC
# MAGIC | id | value    | color    |
# MAGIC |----|----------|----------|
# MAGIC | 1  | [apple]  | [red]    |
# MAGIC | 2  | [banana] | [yellow] |
# MAGIC | 3  | [orange] | []       |

# COMMAND ----------

# 8. cogroup
# The cogroup function groups data from two RDDs that share the same key:
# for each key it returns a tuple of iterables, one with the values from each RDD.
print("\n### 8. cogroup ###")
print("Description: Group data from two RDDs sharing the same key.")

# Example 1: Cogroup two RDDs
fruits_rdd = sc.parallelize([(1, "apple"), (2, "banana"), (3, "orange")])
colors_rdd = sc.parallelize([(1, "red"), (2, "yellow")])
cogrouped_fruits_colors = fruits_rdd.cogroup(colors_rdd).mapValues(lambda x: (list(x[0]), list(x[1]))).collect()
print("01 cogroup example (fruits with colors):", cogrouped_fruits_colors)

# Example 2: Cogroup sales data with target data
sales_rdd = sc.parallelize([("store1", 100), ("store2", 200)])
targets_rdd = sc.parallelize([("store1", 150), ("store3", 250)])
cogrouped_sales_targets = sales_rdd.cogroup(targets_rdd).mapValues(lambda x: (list(x[0]), list(x[1]))).collect()
print("02 cogroup example (sales-targets cogroup):", cogrouped_sales_targets)

# COMMAND ----------
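
# Unlike join, cogroup keeps keys that appear in only one RDD, returning an
# empty list for the missing side. A small sketch that uses this to spot
# stores that have sales but no target, or a target but no sales:
unmatched_stores = (sales_rdd.cogroup(targets_rdd)
                    .filter(lambda kv: not list(kv[1][0]) or not list(kv[1][1]))
                    .keys()
                    .collect())
print("stores missing sales or targets:", unmatched_stores)

# COMMAND ----------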

# MAGIC %md
# MAGIC ## `distinct` Function

# COMMAND ----------

# 9. distinct
print("\n### 9. distinct ###")
print("Description: Return a new RDD containing the distinct elements in this RDD.")

# Example: Unique words from a list of words
words = ["cat", "dog", "cat", "elephant", "dog"]
words_rdd = sc.parallelize(words)
distinct_words = words_rdd.distinct().collect()
print("distinct example (unique words):", distinct_words)
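
# COMMAND ----------

# distinct also works on the numeric RDD; note that it triggers a shuffle,
# since duplicate elements may sit in different partitions. A minimal sketch:
numbers_with_dups = sc.parallelize([1, 2, 2, 3, 3, 3])
print("distinct example (unique numbers):", numbers_with_dups.distinct().collect())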