Skip to content

Latest commit

 

History

History
201 lines (164 loc) · 4.24 KB

Data Science.md

File metadata and controls

201 lines (164 loc) · 4.24 KB

SQL for Data Science

Data Science operations using SQL

  • Linking data from different data stores ( Data Warehouse, Data Lakes, Databases )
  • Filtering, cleaning and reformatting data for different uses ( EDA, Analysis, Dashboards )
  • Aggregating data to provide big-picture summaries.
  • Answering specific questions about business operations ( Current year profit, Average sale, Trends )

Source of Data

  1. Relational and NoSQL (Non Relational) Databases.
  2. Data from mobile applications, IoT devices, web logs and automated systems.
  3. Manually managed data for training.

ETL: Extract Transform Load

  • Extract: Read the data from various data sources.
  • Transform: Clean, preprocess and reshape the data (Trim whitespace, reformat date, standardize value, convert data type)
  • Load: Write, push or dump into a data warehouse or database.

Queries

1. Counting Rows and Items 2. Aggregation Functions 3. Extreme Value Identification 4. Slicing Data 5. Sorting Data 6. Filter Patterns 7. Group By Filtering

Table

ID first_name last_name age gender city birthday
1 Kirankumar Yadav 25 M Thane 1996-02-07
2 Paramveer Yadav 26 M Kalyan 1995-01-21
3 Gaurav Sonar 26 M Kalyan 1995-03-21
4 Pranit Sorte 28 M Ambernath 1993-06-21

Create Table

CREATE TABLE Employee(
ID INT IDENTITY,
first_name VARCHAR(25) NOT NULL,
last_name VARCHAR(25),
age INT CHECK (Age>=18),
gender VARCHAR(1),
city VARCHAR(25) 
)

INSERT: Insert Rows

INSERT INTO employee(first_name, last_name, age, gender, city)
VALUES('Kirankumar', 'Yadav', 25, 'M', 'Thane')

ALTER: Add New Column to Table: DATE (YYYY-MM-DD)

ALTER TABLE employee
ADD birthday DATE 

UPDATE: Update Table Value

UPDATE employee
SET birthday='1996-02-07' WHERE ID=1

SELECT

SELECT * FROM employee 

INSERT: Insert Multiple Rows

INSERT INTO employee(first_name, last_name, age, gender, birthday)
VALUES('Paramveer', 'Yadav', 28, 'M', 'Kalyan', '1995-01-21'),
      ('Gaurav', 'Sonar', 28, 'M', 'Kalyan', '1995-03-21'),
      ('Pranit', 'Sorte', 30, 'M', 'Ambernath', '1993-06-21')

DELETE

DELETE FROM employee
WHERE ID=5

DISTINCT: Unique Values in a Column

SELECT DISTINCT(age)
FROM employee

COUNT: Number of Rows or Items

SELECT COUNT(ID)
FROM employee

Aggregate Functions

SUM and AVG: Sum and Average of Numerical Values

SELECT
SUM(age) AS total_age,
AVG(age) AS average_age
FROM employee

Extreme Value Identification

MAX and MIN: Maximum and Minimum Numeric Value

SELECT
MAX(Age) AS max_age,
MIN(Age) AS min_age
FROM employee

Slicing Data

SELECT * FROM employee
WHERE city='Kalyan'

Sorting Data

SELECT * FROM employee
ORDER BY age DESC

SELECT * FROM employee
ORDER BY first_name -- Alphabetical Order

Filter Patterns

SELECT * FROM employee
WHERE first_name LIKE 'K%' -- Starting with K

SELECT * FROM employee
WHERE first_name LIKE '%R' -- Ending with R

SELECT * FROM employee
WHERE first_name LIKE '%an%' -- Contains an

Group By Filtering (Aggregation + Having)

SELECT SUM(age) AS age, city 
FROM employee 
GROUP BY city

SELECT SUM(age) AS age, last_name 
FROM employee 
GROUP BY last_name
SELECT SUM(age) AS age, city 
FROM employee 
GROUP BY city 
HAVING SUM(age) > 30

OFFSET

SELECT first_name, last_name
FROM employee
ORDER BY city
OFFSET 2 ROWS

TOP

SELECT TOP 2 *
FROM employee

BETWEEN

SELECT * FROM employee
WHERE age BETWEEN 26 AND 28

IN

SELECT * FROM employee
WHERE age IN (26,28)

IS NOT NULL

SELECT * FROM employee
WHERE age IS NOT NULL

DROP Table

DROP TABLE employee