Answers
Answer 1
# Import the raw gene-level counts table for GSE60450
raw_counts <- read_csv(file = "data/GSE60450_GeneLevel_Raw_data.csv")
## New names:
## Rows: 23735 Columns: 14
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): ...1, gene_symbol dbl (12): GSM1480291, GSM1480292, GSM1480293,
## GSM1480294, GSM1480295, GSM148...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Reshape to long format (one row per gene/sample pair), as was done in
# sections 5 and 6, so the counts can be plotted
raw_seqdata <- raw_counts %>%
  pivot_longer(
    cols = starts_with("GSM"),
    names_to = "Sample",
    values_to = "Count"
  )
# Attach the sample metadata, then derive a short Group label from the
# characteristics text. Patterns are tried top to bottom; a row that matches
# none of them gets NA.
raw_allinfo <- raw_seqdata %>%
  full_join(sampleinfo, by = join_by(Sample == sample_id)) %>%
  mutate(
    Group = case_when(
      str_detect(characteristics, "basal.*virgin") ~ "bvirg",
      str_detect(characteristics, "basal.*preg") ~ "bpreg",
      str_detect(characteristics, "basal.*lact") ~ "blact",
      str_detect(characteristics, "luminal.*virgin") ~ "lvirg",
      str_detect(characteristics, "luminal.*preg") ~ "lpreg",
      str_detect(characteristics, "luminal.*lact") ~ "llact"
    )
  )
# Boxplot of log2(raw count + 1) for each sample, filled by group.
# The +1 keeps zero counts finite on the log scale.
raw_allinfo %>%
  ggplot(aes(x = Sample, y = log2(Count + 1), fill = Group)) +
  geom_boxplot()
# Keep only the rows for the same 8 genes of interest used in section 6
mygenes_raw_counts <- raw_allinfo %>%
  filter(gene_symbol %in% mygenes)
# Jitter plot of log2(count + 1) per group, coloured by group,
# with one panel per gene
ggplot(
  mygenes_raw_counts,
  aes(x = Group, y = log2(Count + 1), colour = Group)
) +
  geom_jitter() +
  facet_wrap(~ gene_symbol)
Answer 2
# Import the normalised (CPM and TMM) gene-level counts table for GSE63310
counts_GSE63310 <- read_csv(
  file = "data/GSE63310_GeneLevel_Normalized(CPM.and.TMM)_data.csv"
)
## New names:
## Rows: 23763 Columns: 13
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): ...1, gene_symbol dbl (11): GSM1545535, GSM1545536, GSM1545537,
## GSM1545538, GSM1545539, GSM154...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# The first column has no header in the file (readr names it `...1`);
# give it a meaningful name
names(counts_GSE63310)[1] <- "gene_id"
# Import the sample metadata for GSE63310
sampleinfo_GSE63310 <- read_csv(file = "data/GSE63310_filtered_metadata.csv")
## New names:
## Rows: 11 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (4): ...1, characteristics, strain/background, age
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# The first metadata column is also unnamed in the file; it is used as the
# join key below, so name it sample_id
names(sampleinfo_GSE63310)[1] <- "sample_id"
# Reshape to long format: every column except the two gene identifiers is a
# sample, so pivot those into Sample/Count pairs
seqdata_GSE63310 <- counts_GSE63310 %>%
  pivot_longer(
    cols = -c("gene_id", "gene_symbol"),
    names_to = "Sample",
    values_to = "Count"
  )
# Attach the sample metadata, then derive a short Group label from the
# characteristics text. Patterns are tried top to bottom; a row that matches
# none of them gets NA.
allinfo_GSE63310 <- seqdata_GSE63310 %>%
  full_join(sampleinfo_GSE63310, by = join_by(Sample == sample_id)) %>%
  mutate(
    Group = case_when(
      str_detect(characteristics, "CD29loCD61\\+.*luminal") ~ "l_CD29lo+",
      str_detect(characteristics, "CD29loCD61-.*luminal") ~ "l_CD29lo-",
      str_detect(characteristics, "CD29hi.*Basal") ~ "b_CD29hi",
      str_detect(characteristics, "CD29hi.*mammary") ~ "m_CD29hi",
      str_detect(characteristics, "CommaD-beta-geo") ~ "g_CommaD"
    )
  )
# Boxplots of log2(count + 1) per sample, drawn three times with a different
# fill variable each time.
# 1. Fill by group
allinfo_GSE63310 %>%
  ggplot(aes(x = Sample, y = log2(Count + 1), fill = Group)) +
  geom_boxplot()
# 2. Fill by strain/background (backticks are required because the column
#    name contains "/")
allinfo_GSE63310 %>%
  ggplot(aes(x = Sample, y = log2(Count + 1), fill = `strain/background`)) +
  geom_boxplot()
# 3. Fill by age
allinfo_GSE63310 %>%
  ggplot(aes(x = Sample, y = log2(Count + 1), fill = age)) +
  geom_boxplot()