-
Notifications
You must be signed in to change notification settings - Fork 1
/
SQL code
134 lines (85 loc) · 4.45 KB
/
SQL code
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
-- Data Description
/**
In order to have a better data description we usually check the Shape and Size of our dataset along with the general description of datasets such as count, unique values etc.
**/
-- Check the Size of dataset
SELECT count(column_name)
FROM caba-demo.stdnt.INFORMATION_SCHEMA.COLUMNS
WHERE table_name = 'stnd';
-- This showed we had 9 columns
select count(gender) as row_count from `caba-demo.stdnt.stnd`;
-- 1000 Rows in the dataset
-- Check the column types
SELECT column_name, data_type
FROM caba-demo.stdnt.INFORMATION_SCHEMA.COLUMNS
WHERE table_name = 'stnd';
-- Check the metadata / Description of dataset
SELECT table_name, column_name, is_nullable, data_type, is_partitioning_column
FROM `caba-demo.stdnt`.INFORMATION_SCHEMA.COLUMNS;
-- Check the subset of observations recorded in the dataset
select * from `caba-demo.stdnt.stnd` limit 10;
-- Data Aggregation
/**
It helps us understand the data trends and values based on the compact display of values
**/
-- check the unique values alongwith the count of values for categorical columns
SELECT count(gender) as gender_counter, gender FROM
`caba-demo.stdnt.stnd` group by gender;
SELECT count(race_ethnicity) as race_counter, race_ethnicity FROM
`caba-demo.stdnt.stnd` group by race_ethnicity;
SELECT count(parental_level_education) as parental_level_education_counter, parental_level_education FROM
`caba-demo.stdnt.stnd` group by parental_level_education;
SELECT count(lunch) as lunch_counter, lunch FROM
`caba-demo.stdnt.stnd` group by lunch;
-- check the min and max of numerical columns
SELECT min(math_score) as math_score_min, max(math_score) as math_score_max,
min(reading_score) as reading_score_min, max(reading_score) as reading_score_max,
min(writing_score) as writing_score_min, max(writing_score) as writing_score_max,
min(obtained_score) as obtained_score_min, max(obtained_score) as obtained_score_max
FROM `caba-demo.stdnt.stnd`;
-- Filter values
-- Filter values for group A
select * from `caba-demo.stdnt.stnd`
where race_ethnicity = "group A";
-- Filter values for Bachelor's and Master's Parental level of education
select * from `caba-demo.stdnt.stnd`
where parental_level_education in ("master's degree", "bachelor's degree");
-- Sort Values
select * from `caba-demo.stdnt.stnd` order by obtained_score desc;
-- Data Summary
/**
Summaries of data help us understand the detailed trends followed in datasets based on concise information using measures of location and spread
**/
-- Summarize by mean values
select test_preparation, avg(math_score) as Mean_Mathscore, avg(reading_score) as Mean_Readingscore, avg(writing_score) as Mean_Writingscore,
avg(obtained_score) as Mean_Obtainedscore from `caba-demo.stdnt.stnd` group by test_preparation;
-- summarize by median
select PERCENTILE_CONT(obtained_score,0.5) OVER() AS median_obtainedscore , PERCENTILE_CONT(writing_score,0.5) OVER() AS median_writingscore,
PERCENTILE_CONT(math_score,0.5) OVER() AS median_mathscore, PERCENTILE_CONT(reading_score,0.5) OVER() AS median_readingscore
from `caba-demo.stdnt.stnd` limit 1;
--summarize by mode
SELECT
APPROX_TOP_COUNT(obtained_score,1) AS mode_obtainedscore, APPROX_TOP_COUNT(writing_score,1) AS mode_writingscore,
APPROX_TOP_COUNT(math_score,1) AS mode_mathscore, APPROX_TOP_COUNT(reading_score,1) AS mode_readingscore
from `caba-demo.stdnt.stnd` limit 1;
-- stddev
SELECT
STDDEV(obtained_score) AS standard_deviation
FROM `caba-demo.stdnt.stnd`;
-- creating temp tables with grouped information
drop table if exists `caba-demo.stdnt.groupbyparental`;
Create table `caba-demo.stdnt.groupbyparental`
as
(SELECT avg(obtained_score) as Obtained_score, parental_level_education FROM
`caba-demo.stdnt.stnd` group by parental_level_education);
select * from `caba-demo.stdnt.groupbyparental` order by Obtained_score;
-- Making quartiles by Obtained Score
drop table if exists `caba-demo.stdnt.IQR`;
Create table `caba-demo.stdnt.IQR` as (
select PERCENTILE_CONT(obtained_score,0.25) OVER() AS q1_obtainedscore , PERCENTILE_CONT(obtained_score,0.75) OVER() AS q3_obtainedscore,
(PERCENTILE_CONT(obtained_score,0.75) OVER()) - (PERCENTILE_CONT(obtained_score,0.25) OVER()) as IQR
from `caba-demo.stdnt.stnd` limit 1);
select (q1_obtainedscore - (1.5* IQR)) as maximum, (q3_obtainedscore + (1.5* IQR)) as minimum from
`caba-demo.stdnt.IQR`;
-- identify the outliers
select * from `caba-demo.stdnt.stnd` where obtained_score > 323.8 or obtained_score < 86.8;