Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
efabe21
new section added on sampling for eda
alexsnowdon Aug 27, 2025
866121d
adding sampling-for-eda section to toc.yml
alexsnowdon Aug 27, 2025
95f9543
updating convert.py with correct details
alexsnowdon Aug 27, 2025
887c036
updaying file paths for data
alexsnowdon Aug 27, 2025
baf5854
adding link and editing written bit under: taking the sample section
alexsnowdon Aug 28, 2025
6302ecb
edited content on stopping spark session
alexsnowdon Aug 28, 2025
64ea917
adding the new sampling-for-eda.md and the .ipynb and .r file from re…
alexsnowdon Sep 18, 2025
61c0990
deleting the old sampling_for_eda.ipynb asconvert.py wouldnt work
alexsnowdon Sep 18, 2025
08ce96d
sampling for eda data
alexsnowdon Sep 18, 2025
f7af615
added more code and written content to sampling-for-eda.md and used r…
alexsnowdon Sep 18, 2025
b6ab2f7
written and coding content added to the sampling-for-eda.md ready for…
alexsnowdon Sep 22, 2025
6443b2f
added pyspark info to dropping duplicates section
alexsnowdon Sep 22, 2025
7e4be1a
formatting mot_eda_sample.head()
alexsnowdon Sep 22, 2025
80451c1
corrected python output formatting and reverse converted the .md file
alexsnowdon Sep 22, 2025
5c69283
updating tabs to correct page formatting
alexsnowdon Sep 22, 2025
9ec63ea
updated test_date to datetime formatting in python sample dataframe
alexsnowdon Sep 24, 2025
bdbd9fb
updating sdf_describe bit for big data
alexsnowdon Sep 24, 2025
ea4c407
adding summary code for python sample
alexsnowdon Sep 24, 2025
3f75bba
images added to sampling-for-eda page
alexsnowdon Sep 24, 2025
c94f19c
adding images to sampling for eda.md tabs
alexsnowdon Sep 24, 2025
7d66fa5
editing image tabs
alexsnowdon Sep 24, 2025
1e33487
formatting image tabs again
alexsnowdon Sep 24, 2025
859ed95
changing image format and adding further resources
alexsnowdon Sep 24, 2025
445b534
adding reverse_convert output files
alexsnowdon Sep 24, 2025
607bd24
trying to add figures to tabs in sampling-for-eda.md
alexsnowdon Sep 24, 2025
32e0f74
figure sections updates for sampling-for-eda.md
alexsnowdon Sep 24, 2025
f6985e2
reverse_convert output updated
alexsnowdon Sep 24, 2025
a5ee753
editing R cell for correlation data on sample
alexsnowdon Sep 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ons-spark/_toc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ parts:
- caption: Analysis in Spark
chapters:
- file: spark-analysis/big-data-workflow
- file: spark-analysis/sampling-for-eda
- file: spark-analysis/interpolation
- file: spark-analysis/logistic-regression
- file: spark-analysis/visualisation
Expand Down
37,661 changes: 37,661 additions & 0 deletions ons-spark/data/mot_eda_sample_0.1.csv

Large diffs are not rendered by default.

Binary file added ons-spark/images/mean_test_mileage_colour_R.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
251 changes: 251 additions & 0 deletions ons-spark/raw-notebooks/sampling-for-eda/r_input.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,251 @@


# Load in required packages
library(sparklyr)
library(dplyr)
library(pillar)

# Start a local Spark session named "sampling_for_eda", using the
# configuration object returned by spark_config()
default_config <- sparklyr::spark_config()

sc <- sparklyr::spark_connect(
  master = "local",
  app_name = "sampling_for_eda",
  config = default_config
)


# Confirm the connection is open before reading any data
sparklyr::spark_connection_is_open(sc)




# Location of the MOT test results CSV
#mot_path = "s3a://onscdp-dev-data01-5320d6ca/bat/dapcats/mot_test_results.csv"
mot_path <- "C:/Users/snowda/repos/data/mot_test_results.csv"

# Load the CSV into Spark, treating the first row as a header and letting
# Spark infer the column types
mot <- sparklyr::spark_read_csv(
  sc,
  mot_path,
  header = TRUE,
  infer_schema = TRUE
)

# Inspect the column names, types and first few values
pillar::glimpse(mot)




# Print the number of rows in the full dataset
print(sparklyr::sdf_nrow(mot))




# Select columns we want to work with
mot <- mot %>%
  sparklyr::select(
    vehicle_id, test_date, test_mileage, postcode_area,
    make, colour, cylinder_capacity
  )

# Change column data types or column name formatting where needed. For
# example, cast test_date to a date.
# The result has to be assigned back to `mot`; without the assignment the
# change is discarded and the schema check below shows the old type.
# Note the function is `as.Date()` (capital D); `as.date()` does not exist.
mot <- mot %>%
  dplyr::mutate(test_date = as.Date(test_date))

# Re-check the schema to ensure your changes have been made to the mot dataframe
pillar::glimpse(mot)




# Count missing values before deciding whether to drop them. This example
# counts the rows where test_mileage is missing.

print(
  sparklyr::sdf_nrow(
    dplyr::filter(mot, is.na(test_mileage))
  )
)




# Only if it suits your data: drop every row that has a missing value in
# ANY column, then show how many rows remain.
mot <- na.omit(mot)

sdf_nrow(mot)



# Identify duplicate data in the mot dataframe: group on every column, count
# the rows in each group and keep only the groups that occur more than once.
duplicates <- mot %>%
  group_by(
    vehicle_id, test_date, test_mileage, postcode_area,
    make, colour, cylinder_capacity
  ) %>%
  summarise(count = n()) %>%
  ungroup() %>%
  # The pipe after filter() is required; without it arrange() is evaluated as
  # a separate statement with no data and fails.
  filter(count > 1) %>%
  arrange(desc(count))

# Number of distinct row combinations that appear more than once
duplicates %>%
  sdf_nrow() %>%
  print()




# If appropriate for your data, remove the duplicated rows and preview your
# clean dataset (i.e. removal of duplicates and missing values).

mot_clean <- sdf_distinct(mot)

# Store the cleaned row count; it is reused later as the population size
mot_clean_size <- mot_clean %>%
  sparklyr::sdf_nrow()

mot_clean_size %>% print()

pillar::glimpse(mot_clean)




# Listing the distinct values of a categorical column is a quick sanity check.
# Here we collect the distinct values of 'colour', i.e. every car colour
# reported in the dataframe, which can also help you spot anomalies or errors.
colour <- sparklyr::sdf_collect(
  sparklyr::sdf_distinct(
    sparklyr::select(mot_clean, colour)
  )
)

colour




# Example output of `colour`:
#
# # A tibble: 20 × 1
#    colour
#    <chr>
#  1 RED
#  2 CREAM
#  3 GOLD
#  4 BROWN
#  5 YELLOW
#  6 BRONZE
#  7 BLACK
#  8 MULTI-COLOUR
#  9 NOT STATED
# 10 SILVER
# 11 PINK
# 12 PURPLE
# 13 BLUE
# 14 GREY
# 15 BEIGE
# 16 MAROON
# 17 TURQUOISE
# 18 WHITE
# 19 ORANGE
# 20 GREEN




# Correlation matrix for the numeric columns of the cleaned Spark dataframe.
# The cleaned table is `mot_clean`; `mot_distinct` is never defined in this
# script.
corr_matrix <- mot_clean %>%
  ml_corr(c("vehicle_id", "test_mileage", "cylinder_capacity"))
corr_matrix




# Describe the test_mileage column of the cleaned Spark dataframe
summary_mileage <- sparklyr::sdf_describe(mot_clean, cols = "test_mileage")
print(summary_mileage)



#' Calculate a suggested sample size for a finite population
#'
#' Computes an initial sample size from the z score, proportion and margin of
#' error, then adjusts it for the size of the population.
#'
#' @param N Population size. Must be greater than 0.
#' @param z z score (e.g. 1.96 = 95% confidence).
#' @param p Population proportion, between 0 and 1 (use 0.5 if unknown).
#' @param e Margin of error (e.g. 0.05 for 5%). Must be greater than 0.
#' @return The adjusted sample size rounded up to a whole number. The value is
#'   printed and returned invisibly.
sample_size_finite <- function(N, z, p, e) {
  # Fail early on inputs that would otherwise silently give NaN or Inf
  stopifnot(
    is.numeric(N), N > 0,
    is.numeric(z),
    is.numeric(p), p >= 0, p <= 1,
    is.numeric(e), e > 0
  )

  # Initial sample size without finite population correction
  n <- (z^2 * p * (1 - p)) / (e^2)

  # Adjusted sample size for finite population
  adjusted_size <- n / (1 + (n / N))

  # print() shows the result and passes it back invisibly, so the value can
  # still be assigned by the caller.
  print(ceiling(adjusted_size))
}

# Work out a sample size for the mot_clean data using a 99.99% confidence
# level (z = 3.89) and a 1% margin of error.
# The population size N is the mot_clean_size value stored earlier.

sample_size <- sample_size_finite(
  N = mot_clean_size,
  z = 3.89,
  p = 0.5,
  e = 0.01
)



# Fraction of the cleaned data needed to reach the suggested sample size.
# It is printed in a separate step: `%>%` binds more tightly than `/`, so
# `sample_size / mot_clean_size %>% print()` prints mot_clean_size instead of
# the fraction.
fraction <- sample_size / mot_clean_size
print(fraction)

# Take the sample without replacement; the seed makes it reproducible.
# NOTE: Spark samples by fraction, so the number of rows returned is expected
# to be close to, but not necessarily equal to, sample_size.
mot_sample <- mot_clean %>%
  sparklyr::sdf_sample(fraction, replacement = FALSE, seed = 99)
mot_sample %>% sparklyr::sdf_nrow()





# Load the saved sample into a local R data frame for EDA
mot_eda_sample <- utils::read.csv(
  file = "D:/repos/ons-spark/ons-spark/data/mot_eda_sample_0.1.csv"
)

# Inspect the column names, types and first few values
pillar::glimpse(mot_eda_sample)




# Again, check the data types of each column in the dataframe, change them if
# necessary. Here test_date becomes a Date and the categorical columns become
# factors. across() replaces the superseded mutate_at().

mot_eda_sample <- mot_eda_sample %>%
  dplyr::mutate(
    test_date = as.Date(test_date),
    dplyr::across(c(make, colour), as.factor)
  )




# Per-column summary of the sample data frame
summary <- base::summary(object = mot_eda_sample)
print(summary)




# Correlation matrix of the sample, using the same three columns as the
# Spark correlation above
correlation_data <- mot_eda_sample[
  c("vehicle_id", "test_mileage", "cylinder_capacity")
]
stats::cor(x = correlation_data)




# Count of each colour and its percentage of the whole sample, largest first
colour_percentage <- mot_eda_sample %>%
  dplyr::group_by(colour) %>%
  dplyr::summarise(count = dplyr::n()) %>%
  dplyr::mutate(percentage = count / (nrow(mot_eda_sample)) * 100) %>%
  dplyr::arrange(dplyr::desc(percentage))




# Mean test mileage per colour (ignoring missing mileages), highest first
mean_mileage <- mot_eda_sample %>%
  dplyr::group_by(colour) %>%
  dplyr::summarise(mean_mileage = mean(test_mileage, na.rm = TRUE)) %>%
  dplyr::arrange(dplyr::desc(mean_mileage))

print(mean_mileage)




library(ggplot2)

# Bar chart of mean mileage per colour, bars ordered from highest to lowest
# mean, each labelled with its rounded value
ggplot2::ggplot(
  data = mean_mileage,
  mapping = ggplot2::aes(
    x = reorder(colour, -mean_mileage),
    y = mean_mileage
  )
) +
  ggplot2::geom_col(fill = "skyblue") +
  ggplot2::geom_text(
    mapping = ggplot2::aes(label = round(mean_mileage, 0)),
    vjust = 1,
    angle = 45
  ) +
  ggplot2::labs(
    title = "Mean Mileage by Car Colour",
    x = "Car Colour",
    y = "Mean Mileage"
  ) +
  ggplot2::theme_minimal() +
  # Tilt the x-axis labels so the colour names do not overlap
  ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 45, hjust = 1))

Loading