# Load the gzip-compressed, tab-separated expression table; the first column
# of the file becomes the row index.
shalek2013_expression = pd.read_table(
    'GSE41265_allGenesTPM.txt.gz',
    compression='gzip',
    index_col=0,
)
# Peek at the first rows.
shalek2013_expression.head()

# Limit how many rows/columns pandas prints when displaying a table.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 50

# Peek again, now with the display limits applied.
shalek2013_expression.head()
# Show the table itself (in a notebook the display also reports its size).
shalek2013_expression
# Read the annotation (series matrix) file: skip its first 33 lines and use
# the first column as the row index.
shalek2013_metadata = pd.read_table(
    '~/Downloads/GSE41265_series_matrix.txt',
    index_col=0,
    skiprows=33,
)
shalek2013_metadata

# Transpose so that the former columns become the rows (and vice versa).
shalek2013_metadata = shalek2013_metadata.T
shalek2013_metadata
# shalek2013_metadata.index and shalek2013_metadata.columns are the row names and column names, corresponding to rownames and colnames in R.
# Tidy the column names: preview each label with '!' stripped from both ends.
[label.strip('!') for label in shalek2013_metadata.columns]
# The same clean-up as the code above can be done with a function:
def remove_exclamation(x: str) -> str:
    """Return ``x`` with leading and trailing ``!`` characters removed.

    Only the two ends of the string are stripped; any ``!`` in the middle
    is left untouched.

    Args:
        x: The label to clean.

    Returns:
        The label without ``!`` characters at either end.
    """
    return x.strip('!')
# Preview: apply the named helper to every column label.
shalek2013_metadata.columns.map(remove_exclamation)

# Assign the cleaned labels back so the table actually uses them.
shalek2013_metadata.columns = shalek2013_metadata.columns.map(
    lambda label: label.strip('!')
)
# Show the first 8 rows.
shalek2013_metadata.head(8)
# Draw a box plot of the expression table and save it as a PDF.
# Open a fresh figure first so that, when this runs as a plain script, the
# plot is not drawn on top of whatever figure happens to be current.
fig = plt.figure()
# Pass the table by keyword: the first positional parameter of sns.boxplot
# has not always been `data` across seaborn releases.
sns.boxplot(data=shalek2013_expression)
fig.savefig('shalek2013_expression_boxplot.pdf')
# NOTE(review): expression_logged is not assigned anywhere in the visible
# script -- presumably a log-transformed copy of shalek2013_expression; confirm.
# Element-wise boolean table: True where the value is below 10.
expression_logged < 10
# Keep the values strictly below 10; every other entry becomes NaN, so the
# table keeps its shape.
expression_at_most_10 = expression_logged.where(expression_logged < 10)
expression_at_most_10
# Quality control (QC). pandas reductions work down the columns by default;
# pass axis=1 to reduce across each row instead.
# For every row, count the columns whose value exceeds 1 and keep the rows
# where that count is at least 3.
genes_of_interest = (expression_logged > 1).sum(axis=1) >= 3
# .loc with a boolean Series selects rows.
expression_filtered_by_all_samples = expression_logged.loc[genes_of_interest]
# Inspect the filtered table (the name used here previously was undefined).
print(expression_filtered_by_all_samples.shape)
expression_filtered_by_all_samples.head()
# Box plot of the filtered table, saved as a PDF.
# Start a new figure so this plot does not overlay an earlier one that is
# still the current figure when the file runs as a plain script.
fig = plt.figure()
# Pass the table by keyword: the first positional parameter of sns.boxplot
# has not always been `data` across seaborn releases.
sns.boxplot(data=expression_filtered_by_all_samples)
fig.savefig('expression_filtered_by_all_samples_boxplot.pdf')
# QC on the columns (cells): collect the column labels that start with 'P'.
pooled_ids = [x for x in expression_logged.columns if x.startswith('P')]
# Indexing with a list of labels selects columns by default (rows need .loc);
# this is equivalent to expression_logged.loc[:, pooled_ids].
# The assignment had been swallowed into a comment line; it is restored here.
pooled = expression_logged[pooled_ids]
# The QC above used every sample; the code below restricts it to the columns
# whose label starts with 'S' (the single cells, per the original note).
single_cell = [x for x in expression_logged.columns if x.startswith('S')]
expression_by_single_cells = expression_logged[single_cell]
# For every row, count the selected columns whose value exceeds 1 and keep
# the rows where that count is strictly greater than 3.
# NOTE(review): the all-samples filter earlier in this script uses `>= 3`;
# confirm that the stricter `> 3` here is intended.
gene_select = (expression_by_single_cells > 1).sum(axis=1) > 3
expression_filtered_by_singles = expression_by_single_cells.loc[gene_select]
# Sanity check on the expected size of the filtered table.
assert expression_filtered_by_singles.shape == (6312, 21)