Let's visualize how the most important features are behaving in luminal cancers.
import pandas as pd
import numpy as np
import scanpy as sc
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc
from sklearn.ensemble import GradientBoostingClassifier
rc('text', usetex=True)
rc('text.latex', preamble=r'\usepackage{cmbright}')
rc('font', **{'family': 'sans-serif', 'sans-serif': ['Helvetica']})
%matplotlib inline
# This enables SVG graphics inline.
%config InlineBackend.figure_formats = {'png', 'retina'}
rc = {'lines.linewidth': 2,
'axes.labelsize': 18,
'axes.titlesize': 18,
'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style("dark")
mpl.rcParams['xtick.labelsize'] = 16
mpl.rcParams['ytick.labelsize'] = 16
mpl.rcParams['legend.fontsize'] = 14
ensembl = sc.queries.biomart_annotations(org='hsapiens',
attrs=['ensembl_gene_id', 'gene_biotype',
'external_gene_name', 'description'])
df = pd.read_csv('../luminalAB/mrna/trimmedTMM.csv', index_col='geneid')
logdf = pd.read_csv('../luminalAB/mrna/logTMM.csv', index_col='geneid')
normed = pd.read_csv('../luminalAB/mrna/NormalizedlogTMM.csv', index_col='geneid')
meta = pd.read_csv('../luminalAB/mrna/lumAB_id_cancer.txt', sep='\t')
meta.ID = meta.ID.astype('category')
meta.ID.cat.set_categories(df.columns, inplace=True)
meta.sort_values("ID", inplace=True)
ensembl = ensembl[ensembl.ensembl_gene_id.isin(df.index)]
gbc = pd.read_csv('../luminalAB/ensemble_genes.txt', header=None, names=['gene'])
lgt = pd.read_csv('../luminalAB/logistic_genes.txt', header=None, names=['gene'])
intsct = np.intersect1d(gbc.gene, lgt.gene)
ens_intsct = ensembl[ensembl.external_gene_name.isin(intsct)].ensembl_gene_id.values
ens_intsct
df = df.loc[ens_intsct]
tidy = df.reset_index().melt(id_vars='geneid', var_name='Sample', value_name='TMM').set_index('Sample')
tidy.shape
tidy = tidy.join(meta.set_index('ID'), how='outer').reset_index()
tidy = tidy.join(ensembl.set_index('ensembl_gene_id'), on=['geneid'], how='outer')
tidy.columns = ['Sample', 'geneid', 'TMM', 'cancer', 'gene_biotype', 'gene', 'description']
tidy['cancer'] = tidy.cancer.str.replace('_', '-')
tidy.head()
tidy = tidy[~tidy.geneid.str.contains('94755')] # this has an extreme outlier that makes visualization difficult.
mean_exp = tidy.groupby('geneid').TMM.mean()
high_exp = mean_exp[mean_exp > 10].index
fig, ax = plt.subplots(figsize=(16, 4))
sns.boxplot(y='TMM', x='gene', hue='cancer', data=tidy[tidy.geneid.isin(high_exp)])
_ = plt.xticks(rotation=60)