-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathProcess.sql
More file actions
230 lines (168 loc) · 7.74 KB
/
Process.sql
File metadata and controls
230 lines (168 loc) · 7.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
USE [cyclistic_case_study];
GO;
--CREATING A NEW TABLE--------------------------------
--DROP TABLE IF EXISTS cyclistic_case_study.dbo.trip_data;
CREATE TABLE cyclistic_case_study.dbo.trip_data
(ride_id VARCHAR(255)
,rideable_type VARCHAR(255)
,started_at DATETIME2
,ended_at DATETIME2
,start_station_name VARCHAR(255)
,start_station_id VARCHAR(255)
,end_station_name VARCHAR(255)
,end_station_id VARCHAR(255)
,start_lat FLOAT
,start_lng FLOAT
,end_lat FLOAT
,end_lng FLOAT
,member_casual VARCHAR(255) )
--Altering the datatype of some of the columns to merge all the tables into a single table--------------------------------------
ALTER TABLE cyclistic_case_study.dbo.june_21 ALTER COLUMN start_station_id VARCHAR(255);
ALTER TABLE cyclistic_case_study.dbo.june_21 ALTER COLUMN end_station_id VARCHAR(255);
ALTER TABLE cyclistic_case_study.dbo.july_21 ALTER COLUMN start_station_id VARCHAR(255);
ALTER TABLE cyclistic_case_study.dbo.july_21 ALTER COLUMN end_station_id VARCHAR(255);
ALTER TABLE cyclistic_case_study.dbo.june_21 ALTER COLUMN start_station_id VARCHAR(255);
ALTER TABLE cyclistic_case_study.dbo.november_21 ALTER COLUMN start_station_id VARCHAR(255);
ALTER TABLE cyclistic_case_study.dbo.april_22 ALTER COLUMN start_station_id VARCHAR(255);
--INSERTING DATA INTO NEW TABLE FROM EXSISTING TABLES------------------------------
INSERT INTO cyclistic_case_study.dbo.trip_data (ride_id, rideable_type, started_at, ended_at,
start_station_name, start_station_id, end_station_name, end_station_id, start_lat,
start_lng, end_lat, end_lng, member_casual)
(SELECT * FROM june_21
UNION
SELECT * FROM july_21
UNION
SELECT * FROM august_2021
UNION
SELECT * FROM september_21
UNION
SELECT * FROM october_21
UNION
SELECT * FROM november_21
UNION
SELECT * FROM december_21
UNION
SELECT * FROM january_22
UNION
SELECT * FROM february_22
UNION
SELECT * FROM march_22
UNION
SELECT * FROM april_22
UNION
SELECT * FROM may_22
)
SELECT COUNT(*) FROM cyclistic_case_study.dbo.trip_data
-- CLEANING -------------------------------------------------------
-- Deleting rows with NULL values ---------------------------------
SELECT count(*)
FROM cyclistic_case_study.dbo.trip_data
WHERE ride_id IS NULL
OR rideable_type IS NULL
OR started_at is NULL
OR ended_at is NULL
OR start_station_name is NULL
OR start_station_id is NULL
OR end_station_name is NULL
OR end_station_id is NULL
OR member_casual is NULL;
DELETE
FROM cyclistic_case_study.dbo.trip_data
WHERE ride_id IS NULL
OR rideable_type IS NULL
OR started_at is NULL
OR ended_at is NULL
OR start_station_name is NULL
OR start_station_id is NULL
OR end_station_name is NULL
OR end_station_id is NULL
OR member_casual is NULL;
-- Checking for anomalous entry-------------------------------------
SELECT DISTINCT start_station_name
FROM cyclistic_case_study.dbo.trip_data;
SELECT DISTINCT end_station_name
FROM cyclistic_case_study.dbo.trip_data;
-- Checking for misspellings---------------------------------
SELECT DISTINCT rideable_type, member_casual
FROM cyclistic_case_study.dbo.trip_data;
--Deleting duplicates ----------------------------------------
SELECT ride_id, COUNT(ride_id)
FROM cyclistic_case_study.dbo.trip_data
GROUP BY ride_id
HAVING COUNT(ride_id) > 1;
------------Deletes the duplicates and the originals
--DELETE
--FROM cyclistic_case_study.dbo.trip_data
--WHERE ride_id IN ( SELECT ride_id
-- FROM cyclistic_case_study.dbo.trip_data
-- GROUP BY ride_id
-- HAVING COUNT(ride_id) > 1);
------------Below method deletes only the duplicates
WITH ROWNUMCTE AS (
SELECT *, ROW_NUMBER() OVER(PARTITION BY ride_id ORDER BY ride_id) AS row_num
FROM cyclistic_case_study.dbo.trip_data
)
DELETE FROM ROWNUMCTE
WHERE row_num > 1
-- Removing rides that have more than 24hrs ride_length ----------------------------------------------------------
SELECT AVG(ride_length) AS avg_ride_time, MIN(ride_length) AS min_ride_time, MAX(ride_length) AS max_ride_time
FROM cyclistic_case_study.dbo.trip_data;
-------Upon reviewing the aboves query's result it is found that there are rides that are non-stop for more than 24hrs/1440mins.
-------Bikes that are not returned to the station within 24hrs are considered stolen or lost.
-------Hence the rides that are longer than 24hrs are removed from trip_data table but stored into stolen_lost_bikes table for any future use.
SELECT ride_id,rideable_type,ride_length, started_at, ended_at, start_station_name, end_station_name, member_casual
FROM cyclistic_case_study.dbo.trip_data
WHERE ride_length >= 1440;
CREATE TABLE cyclistic_case_study.dbo.stolen_lost_bikes
(ride_id VARCHAR(255)
,rideable_type VARCHAR(255)
,started_at DATETIME2
,ended_at DATETIME2
,start_station_name VARCHAR(255)
,start_station_id VARCHAR(255)
,end_station_name VARCHAR(255)
,end_station_id VARCHAR(255)
,start_lat FLOAT
,start_lng FLOAT
,end_lat FLOAT
,end_lng FLOAT
,member_casual VARCHAR(255)
,ride_length int
,day_of_week VARCHAR(15)
,started_at_time TIME(7)
,ended_at_time TIME(7) )
INSERT INTO cyclistic_case_study.dbo.stolen_lost_bikes(ride_id, rideable_type, started_at, ended_at,
start_station_name, start_station_id, end_station_name, end_station_id, start_lat, start_lng,
end_lat, end_lng, member_casual, ride_length, day_of_week, started_at_time, ended_at_time)
(SELECT * FROM cyclistic_case_study.dbo.trip_data WHERE ride_length > 1440)
SELECT * FROM cyclistic_case_study.dbo.stolen_lost_bikes;
DELETE
FROM cyclistic_case_study.dbo.trip_data
WHERE ride_length>1440
-- Delete rows with ride time less than 1 minute-----------------------
SELECT ride_length, started_at, ended_at
FROM cyclistic_case_study.dbo.trip_data
WHERE ride_length<=1;
DELETE
FROM cyclistic_case_study.dbo.trip_data
WHERE ride_length<=1;
-- TRIMing unnecessary white spaces-----------------------------------------------------------
UPDATE cyclistic_case_study.dbo.trip_data SET start_station_name = TRIM(start_station_name);
UPDATE cyclistic_case_study.dbo.trip_data SET end_station_name = TRIM(end_station_name);
SELECT start_station_name, end_station_name FROM cyclistic_case_study.dbo.trip_data;
-- MANIPULATION --------------------------------------------------------
-- Create column: 'started_at_time', 'ended_at_time' ----------------------------
ALTER TABLE cyclistic_case_study.dbo.trip_data ADD started_at_time TIME;
ALTER TABLE cyclistic_case_study.dbo.trip_data ADD ended_at_time TIME;
UPDATE cyclistic_case_study.dbo.trip_data SET started_at_time = CONVERT(TIME(0), started_at);
UPDATE cyclistic_case_study.dbo.trip_data SET ended_at_time = CONVERT(TIME(0), ended_at);
SELECT started_at, started_at_time FROM cyclistic_case_study.dbo.trip_data;
SELECT ended_at, ended_at_time FROM cyclistic_case_study.dbo.trip_data;
-- create column: 'ride_length'-----------------------------------------------------
ALTER TABLE cyclistic_case_study.dbo.trip_data ADD ride_length int;
UPDATE cyclistic_case_study.dbo.trip_data SET ride_length = DATEDIFF(MINUTE, started_at, ended_at ); ---ride_length_min would be an appropriate colname
SELECT started_at, ended_at, ride_length FROM cyclistic_case_study.dbo.trip_data;
-- Create Column: 'day_of_week'-----------------------------------------------------------
ALTER TABLE cyclistic_case_study.dbo.trip_data ADD day_of_week VARCHAR(15);
UPDATE cyclistic_case_study.dbo.trip_data SET day_of_week = DATENAME(WEEKDAY, started_at)
SELECT day_of_week FROM cyclistic_case_study.dbo.trip_data