In this notebook, I download the luminal A and luminal B cancer samples from TCGA, view it, trim it to keep only the most variable genes across specified expression bins, and save the new datasets for simplicity.
import pandas as pd
import numpy as np
import scanpy as sc
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc
rc('text', usetex=True)
rc('text.latex', preamble=r'\usepackage{cmbright}')
rc('font', **{'family': 'sans-serif', 'sans-serif': ['Helvetica']})
%matplotlib inline
# This enables SVG graphics inline.
%config InlineBackend.figure_formats = {'png', 'retina'}
rc = {'lines.linewidth': 2,
'axes.labelsize': 18,
'axes.titlesize': 18,
'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style("dark")
mpl.rcParams['xtick.labelsize'] = 16
mpl.rcParams['ytick.labelsize'] = 16
mpl.rcParams['legend.fontsize'] = 14
df = pd.read_csv('../luminalAB/mrna/brca_luma_vs_lumb_mrna_tmm_311samples.csv')
df.set_index('geneid', inplace=True)
print('There are {0} genes, and {1} individuals'.format(*df.shape))
df.head()
meta = pd.read_csv('../luminalAB/mrna/lumAB_id_cancer.txt', sep='\t')
meta.ID = meta.ID.astype('category')
meta.ID.cat.set_categories(df.columns, inplace=True)
meta.sort_values("ID", inplace=True)
meta.head()
meta.groupby('cancer').count()
plt.scatter(df.mean(axis=1), df.std(axis=1) / df.mean(axis=1),
s=5, alpha=0.1)
plt.xlabel('$\mu$')
plt.ylabel('$\sigma / \mu$')
plt.xscale('log')
plt.hist(df.values.flatten())
plt.xlabel('TMM')
plt.ylabel('Frequency')
plt.yscale('log')
plt.hist((df + 1).apply(np.log10).values.flatten())
plt.xlabel('$\log_{10}$(TMM + 1)')
plt.ylabel('Frequency')
plt.yscale('log')
logdf = (df + 1).apply(np.log10)
normed = ((logdf - logdf.mean(axis=1).values[:, np.newaxis]) / logdf.std(axis=1).values[:, np.newaxis])
sns.kdeplot(normed.values.flatten(), label='data')
sns.kdeplot(np.random.normal(0, 1, len(normed)), label='normal dist')
_ = plt.xlabel('Standardized, scaled log(TMM + 1)')
found = (df > 0).sum(axis=1)
plt.hist(found)
plt.yscale('log')
plt.xlabel('Number of samples gene detected in')
plt.ylabel('Frequency')
print('{0} genes show up in <300 samples'.format((found < 300).sum()))
df = df[found > 300]
# split genes into expression percentiles:
percentile = 0.6
bins = np.logspace(0, 4, 10)
digital = np.digitize(df.mean(axis=1).values, bins=bins)
bin_means = np.array([df.mean(axis=1).values[digital == i].mean() for i in range(1, len(bins))])
# mean and coef of variation:
mean = df.mean(axis=1).values
cv = (df.std(axis=1) / df.mean(axis=1))
# for each bin, keep the top `percentile` most variable genes:
keep = []
for i in range(1, len(bins)):
curr_cv = cv[digital == i].copy()
idx = np.argsort(curr_cv)
q = np.quantile(curr_cv, percentile)
sel = (curr_cv > q)
keep += curr_cv[sel].index.values.tolist()
# plot genes by expression bin:
plt.scatter(mean[digital == i][~sel], cv[digital == i][~sel], s=5, alpha=0.1)
plt.scatter(mean[digital == i][sel], cv[digital == i][sel], s=5, alpha=0.1, color='black')
plt.xscale('log')
plt.xlabel('$\mu$')
plt.ylabel('$\sigma / \mu$')
df = df.loc[keep]
print('Selecting variable genes at the {0} percentile'.format(percentile))
print('There are {0} genes post-trimming'.format(len(df)))
logdf = (df + 1).apply(np.log10)
scaled = (logdf - logdf.mean(axis=1).values[:, np.newaxis]) / logdf.std(axis=1).values[:, np.newaxis]
df.to_csv('../luminalAB/mrna/trimmedTMM.csv')
logdf.to_csv('../luminalAB/mrna/logTMM.csv')
scaled.to_csv('../luminalAB/mrna/NormalizedlogTMM.csv')