Skip to content

Commit 3b3600f

Browse files
ADD: markdowns
1 parent 7faee41 commit 3b3600f

15 files changed

+773
-47
lines changed

Code/20-spark-operations/20-spark-operations.ipynb

+18-10
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,13 @@
3232
},
3333
"outputs": [
3434
{
35-
"output_type": "stream",
3635
"name": "stdout",
3736
"output_type": "stream",
3837
"text": [
39-
"Original RDD ID: 227\nOriginal RDD ID: 227\nTransformed RDD ID: 228\nTransformed RDD result: [2, 4, 6, 8, 10]\n"
38+
"Original RDD ID: 227\n",
39+
"Original RDD ID: 227\n",
40+
"Transformed RDD ID: 228\n",
41+
"Transformed RDD result: [2, 4, 6, 8, 10]\n"
4042
]
4143
}
4244
],
@@ -73,7 +75,6 @@
7375
},
7476
"outputs": [
7577
{
76-
"output_type": "display_data",
7778
"data": {
7879
"text/html": [
7980
"<style scoped>\n",
@@ -142,7 +143,6 @@
142143
},
143144
"outputs": [
144145
{
145-
"output_type": "display_data",
146146
"data": {
147147
"text/html": [
148148
"<style scoped>\n",
@@ -228,11 +228,14 @@
228228
},
229229
"outputs": [
230230
{
231-
"output_type": "stream",
232231
"name": "stdout",
233232
"output_type": "stream",
234233
"text": [
235-
"Original RDD:\n('John', 28)\n('Smith', 44)\n('Adam', 65)\n('Henry', 23)\n"
234+
"Original RDD:\n",
235+
"('John', 28)\n",
236+
"('Smith', 44)\n",
237+
"('Adam', 65)\n",
238+
"('Henry', 23)\n"
236239
]
237240
}
238241
],
@@ -264,11 +267,15 @@
264267
},
265268
"outputs": [
266269
{
267-
"output_type": "stream",
268270
"name": "stdout",
269271
"output_type": "stream",
270272
"text": [
271-
"Original RDD ID: 316\nOriginal RDD ID After filter: 317\nTransformed RDD ID: 318\nFiltered RDD:\n('Smith', 44)\n('Adam', 65)\n"
273+
"Original RDD ID: 316\n",
274+
"Original RDD ID After filter: 317\n",
275+
"Transformed RDD ID: 318\n",
276+
"Filtered RDD:\n",
277+
"('Smith', 44)\n",
278+
"('Adam', 65)\n"
272279
]
273280
}
274281
],
@@ -307,7 +314,6 @@
307314
},
308315
"outputs": [
309316
{
310-
"output_type": "display_data",
311317
"data": {
312318
"text/html": [
313319
"<style scoped>\n",
@@ -406,7 +412,6 @@
406412
},
407413
"outputs": [
408414
{
409-
"output_type": "display_data",
410415
"data": {
411416
"text/html": [
412417
"<style scoped>\n",
@@ -518,6 +523,9 @@
518523
},
519524
"notebookName": "20-spark-operations",
520525
"widgets": {}
526+
},
527+
"jupytext": {
528+
"formats": "ipynb,md"
521529
}
522530
},
523531
"nbformat": 4,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
---
2+
jupyter:
3+
jupytext:
4+
formats: ipynb,md
5+
main_language: python
6+
text_representation:
7+
extension: .md
8+
format_name: markdown
9+
format_version: '1.3'
10+
jupytext_version: 1.16.2
11+
---
12+
13+
# Immutable RDDs
14+
15+
```python
16+
# Test Immutable RDDs
17+
numbers = [1, 2, 3, 4, 5]
18+
numbers_rdd = sc.parallelize(numbers)
19+
print(f"Original RDD ID: {numbers_rdd.id()}")
20+
print(f"Original RDD ID: {numbers_rdd.id()}")
21+
22+
# # Apply a transformation: multiply each number by 2
23+
transformed_rdd = numbers_rdd.map(lambda x: x * 2)
24+
print(f"Transformed RDD ID: {transformed_rdd.id()}")
25+
26+
# # Collect the results to trigger the computation
27+
result = transformed_rdd.collect()
28+
print(f"Transformed RDD result: {result}")
29+
30+
```
31+
32+
```python
33+
%scala
34+
// Test Immutable RDDs
35+
val numbers = List(1, 2, 3, 4, 5)
36+
val numbersRdd = sc.parallelize(numbers)
37+
println(s"Original RDD ID: ${numbersRdd.id}")
38+
println(s"Original RDD ID: ${numbersRdd.id}")
39+
println(s"Original RDD ID: ${numbersRdd.id}")
40+
41+
42+
```
43+
44+
```python
45+
%scala
46+
47+
// numbersRdd = numbersRdd.map(x => x * 2) //OPS!!!!!!!!!!!
48+
49+
// Apply a transformation: multiply each number by 2
50+
val transformedRdd = numbersRdd.map(x => x * 2)
51+
println(s"Transformed RDD ID: ${transformedRdd.id}")
52+
53+
// Collect the results to trigger the computation
54+
val result = transformedRdd.collect()
55+
println(s"Transformed RDD result: ${result.mkString(", ")}")
56+
```
57+
58+
# Immutable DF Example
59+
60+
```python
61+
# Create an RDD
62+
data = [("John", 28), ("Smith", 44), ("Adam", 65), ("Henry", 23)]
63+
rdd = sc.parallelize(data)
64+
65+
# Show the original RDD
66+
print("Original RDD:")
67+
for row in rdd.collect():
68+
print(row)
69+
70+
```
71+
72+
```python
73+
74+
print(f"Original RDD ID: {rdd.id()}")
75+
76+
rdd = rdd.filter(lambda x: x[1] > 30)
77+
78+
print(f"Original RDD ID After filter: {rdd.id()}")
79+
80+
# Filter rows where the age is greater than 30
81+
filtered_rdd = rdd.filter(lambda x: x[1] > 30)
82+
print(f"Transformed RDD ID: {filtered_rdd.id()}")
83+
84+
# Show the transformed RDD
85+
print("Filtered RDD:")
86+
for row in filtered_rdd.collect():
87+
print(row)
88+
```
89+
90+
```python
91+
%scala
92+
// Create an RDD
93+
val data = Seq(("John", 28), ("Smith", 44), ("Adam", 65), ("Henry", 23))
94+
val rdd = sc.parallelize(data)
95+
96+
// Show the original RDD
97+
println("Original RDD:")
98+
rdd.collect().foreach(println)
99+
//rdd = rdd.filter{ case (name, age) => age > 30 }
100+
// // Filter rows where the age is greater than 30
101+
val filteredRdd = rdd.filter{ case (name, age) => age > 30 }
102+
println(s"Transformed RDD ID: ${filteredRdd.id}")
103+
104+
// Show the transformed RDD
105+
println("Filtered RDD:")
106+
filteredRdd.collect().foreach(println)
107+
```
108+
109+
# Spark Lazy Evaluation
110+
111+
```python
112+
# Create an RDD
113+
rdd = sc.parallelize([
114+
("John", 28),
115+
("Smith", 44),
116+
("Adam", 65),
117+
("Henry", 23)
118+
])
119+
120+
# Apply a map transformation to create a new RDD with a tuple including the name and a boolean flag
121+
# if the person is older than 30
122+
mapped_rdd = rdd.map(lambda x: (x[0], x[1], x[1] > 30))
123+
124+
# Filter the RDD to include only people older than 30
125+
filtered_rdd = mapped_rdd.filter(lambda x: x[2])
126+
127+
# Convert the filtered RDD back to a DataFrame
128+
df = spark.createDataFrame(filtered_rdd, ["Name", "Age", "OlderThan30"])
129+
130+
# Select only the name and age columns
131+
final_df = df.select("Name", "Age")
132+
133+
# # Collect the results which triggers the execution of all transformations
134+
results = final_df.collect()
135+
display(results)
136+
137+
```

Code/22-spark-immutability/22-spark-immutablility-example.ipynb

+18-11
Original file line numberDiff line numberDiff line change
@@ -32,11 +32,13 @@
3232
},
3333
"outputs": [
3434
{
35-
"output_type": "stream",
3635
"name": "stdout",
3736
"output_type": "stream",
3837
"text": [
39-
"Original RDD ID: 227\nOriginal RDD ID: 227\nTransformed RDD ID: 228\nTransformed RDD result: [2, 4, 6, 8, 10]\n"
38+
"Original RDD ID: 227\n",
39+
"Original RDD ID: 227\n",
40+
"Transformed RDD ID: 228\n",
41+
"Transformed RDD result: [2, 4, 6, 8, 10]\n"
4042
]
4143
}
4244
],
@@ -73,7 +75,6 @@
7375
},
7476
"outputs": [
7577
{
76-
"output_type": "display_data",
7778
"data": {
7879
"text/html": [
7980
"<style scoped>\n",
@@ -142,7 +143,6 @@
142143
},
143144
"outputs": [
144145
{
145-
"output_type": "display_data",
146146
"data": {
147147
"text/html": [
148148
"<style scoped>\n",
@@ -213,7 +213,6 @@
213213
},
214214
"outputs": [
215215
{
216-
"output_type": "display_data",
217216
"data": {
218217
"text/html": [
219218
"<style scoped>\n",
@@ -292,11 +291,14 @@
292291
},
293292
"outputs": [
294293
{
295-
"output_type": "stream",
296294
"name": "stdout",
297295
"output_type": "stream",
298296
"text": [
299-
"Original RDD:\n('John', 28)\n('Smith', 44)\n('Adam', 65)\n('Henry', 23)\n"
297+
"Original RDD:\n",
298+
"('John', 28)\n",
299+
"('Smith', 44)\n",
300+
"('Adam', 65)\n",
301+
"('Henry', 23)\n"
300302
]
301303
}
302304
],
@@ -328,11 +330,15 @@
328330
},
329331
"outputs": [
330332
{
331-
"output_type": "stream",
332333
"name": "stdout",
333334
"output_type": "stream",
334335
"text": [
335-
"Original RDD ID: 316\nOriginal RDD ID After filter: 317\nTransformed RDD ID: 318\nFiltered RDD:\n('Smith', 44)\n('Adam', 65)\n"
336+
"Original RDD ID: 316\n",
337+
"Original RDD ID After filter: 317\n",
338+
"Transformed RDD ID: 318\n",
339+
"Filtered RDD:\n",
340+
"('Smith', 44)\n",
341+
"('Adam', 65)\n"
336342
]
337343
}
338344
],
@@ -371,7 +377,6 @@
371377
},
372378
"outputs": [
373379
{
374-
"output_type": "display_data",
375380
"data": {
376381
"text/html": [
377382
"<style scoped>\n",
@@ -470,7 +475,6 @@
470475
},
471476
"outputs": [
472477
{
473-
"output_type": "display_data",
474478
"data": {
475479
"text/html": [
476480
"<style scoped>\n",
@@ -582,6 +586,9 @@
582586
},
583587
"notebookName": "22-spark-immutablility-example",
584588
"widgets": {}
589+
},
590+
"jupytext": {
591+
"formats": "ipynb,md"
585592
}
586593
},
587594
"nbformat": 4,

0 commit comments

Comments
 (0)