<hr/>2.5 CDR3 Clonotypes Richness:
import seaborn as sns, matplotlib.pyplot as plt
########################################################################################################################
input_path='/Users/ZYP/Downloads/imm_repertoire/vdjtools_result/'#输入路径
output_path='/Users/ZYP/Downloads/imm_repertoire/vdjtools_result/'#输出路径
sample_info='sample_info.txt'#记录有样本分组的样本信息
########################################################################################################################
list_pos=[]
for line in open(input_path+sample_info,'r'):
info=line[:-1].split('\t')
if info[0]!='ID':
if str(info[1])=='Class1':
list_pos.append(info[0])
list_class=[]
list_sample=[]
list_richness=[]
for line in open(output_path+'basicstats.txt','r'):
info=line[:-1].split('\t')
if info[0]!='sample_id':
if info[0].split('.')[0] in list_pos:
list_class.append('Class1')
else:
list_class.append('Class2')
list_sample.append(info[0].split('.')[0])
list_richness.append(float(info[3]))
<hr/>2.6 CDR3 Clonotypes Length:
import seaborn as sns, matplotlib.pyplot as plt
########################################################################################################################
input_path='/Users/ZYP/Downloads/imm_repertoire/vdjtools_input/'#输入路径
output_path='/Users/ZYP/Downloads/imm_repertoire/vdjtools_result/'#输出路径
sample_info='sample_info.txt'#记录有样本分组的样本信息
########################################################################################################################
dict_sample_LengthCount={}
list_pos=[]
for line in open(input_path+sample_info,'r'):
info=line[:-1].split('\t')
id=info[0]
if id!='ID':
if str(info[1])=='Class1':
list_pos.append(id)
read_file=open(input_path+id+'.clonotypes.TRB.txt','r')
dict_length_count={}
for line in read_file:
if line[0]!='c':
length=len(line[:-1].split('\t')[3])
if length < 30:
if length not in dict_length_count.keys():
dict_length_count[length]=1
else:
dict_length_count[length]+=1
dict_sample_LengthCount[id]=dict_length_count
read_file.close()
list_length=[]
list_class=[]
list_sample=[]
list_count=[]
for sample in dict_sample_LengthCount.keys():
dict_length_count=dict_sample_LengthCount[sample]
for length in dict_length_count.keys():
list_length.append(length)
list_sample.append(sample)
if sample in list_pos:
list_class.append('Class1')
else:
list_class.append('Class2')
list_count.append(dict_length_count[length])
<hr/>2.8 The relationship between CDR3 Abundance and CDR3 Clonotypes Richness:
import seaborn as sns, matplotlib.pyplot as plt
########################################################################################################################
input_path='/Users/ZYP/Downloads/imm_repertoire/vdjtools_input/'#输入路径
output_path='/Users/ZYP/Downloads/imm_repertoire/vdjtools_result/'输出路径
sample_info='sample_info.txt'#记录有样本分组的样本信息
########################################################################################################################
dict_sample_AbundanceRichness={}
list_pos=[]
for line in open(input_path+sample_info,'r'):
info = line[:-1].split('\t')
id = info[0]
if id != 'ID':
if str(info[1]) == 'Class1':
list_pos.append(id)
dict_abundance_richness={}
read_file = open(input_path + id + '.clonotypes.TRB.txt', 'r')
for line in read_file:
if line[0]!='c':
info=line[:-1].split('\t')
abundance=info[0]
if abundance not in dict_abundance_richness.keys():
dict_abundance_richness[abundance]=1
else:
dict_abundance_richness[abundance]+=1
dict_sample_AbundanceRichness[id]=dict_abundance_richness
read_file.close()
list_abundance=[]
list_richniess=[]
list_sample=[]
list_class=[]
for sample in dict_sample_AbundanceRichness.keys():
dict_abundance_richness=dict_sample_AbundanceRichness[sample]
for abundance in dict_abundance_richness.keys():
list_sample.append(sample)
if sample in list_pos:
list_class.append('Class1')
else:
list_class.append('Class2')
list_abundance.append(int(abundance))
list_richniess.append(int(dict_abundance_richness[abundance]))
The relationship between CDR3 Abundance and CDR3 Clonotypes Richness
<hr/>2.9 CDR3 Overlap:
2.9.1 计算样本间CDR3重叠部分并写入本地文件overlapped_CDR3.txt:
########################################################################################################################
input_file=open('/Users/ZYP/Downloads/imm_repertoire/vdjtools_result/intersect.batch.aa.txt','r')#请将带有绝对路径的CalcPairwiseDistances的输出文件写在此处的''内
save_file=open('/Users/ZYP/Downloads/imm_repertoire/vdjtools_result/overlapped_CDR3.txt','w')#请将带有绝对路径的输出文件写在此处的''内
########################################################################################################################
list_samples=[]
diction={}
row=1
for line in input_file:
if row!=1:
info=line[:-1].split('\t')
for sample in [info[0],info[1]]:
if sample not in list_samples:
list_samples.append(sample)
key=info[0]+'and'+info[1]
value=info[4]
diction[key]=value
row+=1
input_file.close()
word=''
for sample in list_samples:
sample=sample.split('.')[0]
word+=sample+'\t'
save_file.write(word[:-1]+'\n')
for sample_row in list_samples:
word=''
for sample_column in list_samples:
if sample_row+'and'+sample_column in diction.keys():
word+=str(diction[sample_row+'and'+sample_column])+'\t'
else:
if sample_column+'and'+sample_row in diction.keys():
word+=str(diction[sample_column+'and'+sample_row])+'\t'
else:
word+='0'+'\t'
word=word[:-1]+'\n'
save_file.write(word)
save_file.close()2.9.2 CDR3 Overlap-heatmap:
import seaborn as sns, pandas as pd, matplotlib.pyplot as plt
########################################################################################################################
input_file='/Users/ZYP/Downloads/imm_repertoire/vdjtools_result/overlapped_CDR3.txt'#请将带有绝对路径的输入文件写在此处的''内
output_file='/Users/ZYP/Downloads/imm_repertoire/vdjtools_result/overlapped_CDR3.pdf'#请将带有绝对路径的输出文件写在此处的''内
########################################################################################################################
list_samples=[]
list_value_list=[]
number_row=1
for line in open(input_file):
if number_row!=1:
list_value=[]
index=0
for value in line[:-1].split('\t'):
list_value.append(float(value))
index+=1
list_value_list.append(list_value)
else:
for sample in line[:-1].split('\t'):
list_samples.append(sample)
number_row+=1
data=pd.DataFrame(data=list_value_list,index=list_samples,columns=list_samples)
ax=sns.clustermap(data,standard_scale=1,figsize=(19,19))
plt.savefig(output_file)
plt.show()结果如下图所示:
<hr/>2.10 CDR3 diversity / evenness / clonality:
2.10.1 计算diversity / evenness / clonality 并可视化:
import math,seaborn as sns,matplotlib.pyplot as plt
########################################################################################################################
input_path='/Users/ZYP/Downloads/imm_repertoire/vdjtools_input/'#输入路径
output_path='/Users/ZYP/Downloads/imm_repertoire/vdjtools_result/'输出路径
sample_info='sample_info.txt'#记录有样本分组的样本信息
########################################################################################################################
def Shannon_entropy(list_frequency):
sum=0
for frequency in list_frequency:
sum+=frequency*math.log(frequency)
H=-sum
return H
def Pielou_evenness(H,richness):
E=H/math.log(richness)
return E
def Clonality(E):
C=1-E
return C
list_pos=[]
list_characteristic=[]
list_value=[]
list_sample=[]
list_class=[]
for line in open(input_path+sample_info,'r'):
info = line[:-1].split('\t')
id = info[0]
if id != 'ID':
if info[1] == 'Class1':
list_pos.append(id)
read_file = open(input_path + id + '.clonotypes.TRB.txt', 'r')
richness=0
TotalCount=0
for line in read_file:
if line[0] != 'c':
count = int(line[:-1].split('\t')[0])
richness+=1
TotalCount+=count
read_file.close()
read_file = open(input_path + id + '.clonotypes.TRB.txt', 'r')
list_frequency = []
for line in read_file:
if line[0] != 'c':
count = int(line[:-1].split('\t')[0])
list_frequency.append(count/TotalCount)
diversity=Shannon_entropy(list_frequency)
evenness=Pielou_evenness(diversity,richness)
clonality=Clonality(evenness)
list_characteristic+=['diversity','evenness','clonality']
list_value+=[diversity,evenness,clonality]
for time in range(0,3):
list_sample.append(id)
if id in list_pos:
list_class.append('Class1')
else:
list_class.append('Class2')
read_file.close()
<hr/>2.11 DR3 length distribution for 1 sample:
2.10.1 CDR3 length distribution among CDRs:
java -jar /path/to/vdjtools/vdjtools.jar
PlotFancySpectratype -t 20 sample1.txt /path/to/vdjtools_result/结果如下图所示:
CDR3 length distribution among CDRs
2.11.2 CDR3 length distribution among genes based on clonotype / frequency:
#CDR3 length distribution based on clonotype number among Vgenes
java -jar /path/to/vdjtools/vdjtools.jar
PlotSpectratypeV -u sample1.txt /path/to/vdjtools_result/
#CDR3 length distribution based on read frequency among Vgenes
java -jar /path/to/vdjtools/vdjtools.jar
PlotSpectratypeV sample1.txt /path/to/vdjtools_result/结果如下图所示:
CDR3 length distribution among genes based on clonotype / frequency
<hr/>2.12 sample similarity based on CDR3:
java -jar /path/to/vdjtools/vdjtools.jar
ClusterSamples -p -l /path/to/vdjtools_result/ /path/to/vdjtools_result/结果如下图所示:
sample_info=open(path+'vdjtools_result/sample_info.txt','w')
sample_info.write('ID\tlabel\tbatch\n')
list_files=[]
for line in open(path+'vdjtools_input/metadata.txt','r'):
if line[0]!='f':
file=line.split('\t')[1]
sample=file.split('.')[0]
list_files.append(file+'.txt')
if int(sample[-3:])<100:
sample_info.write(sample+'\t'+'Class1'+'\t'+'1'+'\n')
else:
sample_info.write(sample+'\t'+'Class2'+'\t'+'1'+'\n')
sample_info.close()
list_sample=[]
list_V=[]
list_J=[]
list_dict_V=[]
list_dict_J=[]
for file in list_files:
read_file=open(path+'vdjtools_input/'+file,'r')
sample=file.split('.')[0]
list_sample.append(sample)
dict_CDR={}
dict_V={}
dict_J={}
for line in read_file:
if line[0]!='c':
info=line[:-1].split('\t')
name_V=info[4]
name_J=info[6]
if name_V not in list_V:
list_V.append(name_V)
if name_J not in list_J:
list_J.append(name_J)
if name_V not in dict_V.keys():
dict_V[name_V]=1
else:
dict_V[name_V]+=1
if name_J not in dict_J.keys():
dict_J[name_J]=1
else:
dict_J[name_J]+=1
list_dict_V.append(dict_V)
list_dict_J.append(dict_J)
read_file.close()
word = 'item'
for sample in list_sample:
word+='\t'+sample
word+='\n'
V_file=open('/Users/ZYP/Downloads/imm_repertoire/vdjtools_result/matrix_V.txt','w')
V_file.write(word)
for V in list_V:
check = []
info=str(V)
for dict_V in list_dict_V:
if V in dict_V.keys():
info+='\t'+str(dict_V[V])
check.append(float(dict_V[V]))
else:
info+='\t'+'0'
check.append(0)
info+='\n'
if max(check)!=0:
V_file.write(info)
V_file.close()
print('V gene expression matrix have been build!')
J_file=open('/Users/ZYP/Downloads/imm_repertoire/vdjtools_result/matrix_J.txt','w')
J_file.write(word)
for J in list_J:
check = []
info=str(J)
for dict_J in list_dict_J:
if J in dict_J.keys():
info+='\t'+str(dict_J[J])
check.append(float(dict_J[J]))
else:
info+='\t'+'0'
check.append(0)
info+='\n'
if max(check)!=0:
J_file.write(info)
J_file.close()
print('J gene expression matrix have been build!')
print('expression matrix have been build!')输出文件格式如下:
2.15.3 Volcanoplot:
import seaborn as sns, matplotlib.pyplot as plt, pandas as pd, numpy as np
########################################################################################################################
input_file='/Users/ZYP/Downloads/imm_repertoire/vdjtools_result/diffexp_result_J.txt'#请将带有绝对路径的输入文件写在此处的''内
output_file='/Users/ZYP/Downloads/imm_repertoire/vdjtools_result/diffexp_VolcanoPlot_J.pdf'#请将带有绝对路径的输出文件写在此处的''内
########################################################################################################################
list_log2FC=[]
list_padj=[]
index_log2FC=2
index_padj=6
number_row=1
for line in open(input_file):
if number_row==1:
index_log2FC=line[:-1].split('\t').index('log2FoldChange')
index_padj=line[:-1].split('\t').index('padj')
else:
if line.split('\t')[index_log2FC]!='NA' and line.split('\t')[index_padj]!='NA':
list_log2FC.append(float(line.split('\t')[index_log2FC]))
list_padj.append(float(line.split('\t')[index_padj]))
number_row+=1
foldchange=np.asarray(list_log2FC)
pvalue=np.asarray(list_padj)
2.15.4 Heatmap:
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
########################################################################################################################
input_file='/Users/ZYP/Downloads/imm_repertoire/vdjtools_result/padj<0.05&log2FC>1or<-1_V.txt'#请将带有绝对路径的输入文件写在此处的''内
output_file='/Users/ZYP/Downloads/imm_repertoire/vdjtools_result/diffexp_heatmap_V.pdf'#请将带有绝对路径的输出文件写在此处的''内
sample_info='/Users/ZYP/Downloads/imm_repertoire/vdjtools_result/sample_info.txt'#请将带有绝对路径的样本分组文件写在此处的''内
########################################################################################################################
list_pos=[]
for line in open(sample_info,'r'):
info=line[:-1].split('\t')
if info[0]!='ID':
if str(info[1])=='Class1':
list_pos.append(info[0])
list_ids=[]
list_samples=[]
list_value_list=[]
list_sample_colors=[]
list_gene_colors=[]
begin_samples=7
index_log2FC=2
index_p=5
number_row=1
for line in open(input_file):
if number_row!=1:
list_ids.append(line[:-1].split('\t')[0])
list_value=[]
for value in line[:-1].split('\t')[begin_samples:]:
list_value.append(float(value))
list_value_list.append(list_value)
log2FC=float(line[:-1].split('\t')[index_log2FC])
p=float(line[:-1].split('\t')[index_p])
if p<0.05 and log2FC>1:
list_gene_colors.append(sns.xkcd_rgb["dark red"])
if p<0.05 and log2FC<-1:
list_gene_colors.append(sns.xkcd_rgb["marine blue"])
else:
begin_samples = line[:-1].split('\t').index('padj') + 1
index_log2FC = line[:-1].split('\t').index('log2FoldChange')
index_padj = line[:-1].split('\t').index('padj')
for sample in line[:-1].split('\t')[begin_samples:]:
list_samples.append(sample)
if sample in list_pos:
list_sample_colors.append(sns.xkcd_rgb["dark red"])
else:
list_sample_colors.append(sns.xkcd_rgb["marine blue"])
number_row+=1
print(len(list_sample_colors))
print(len(list_gene_colors))
data=pd.DataFrame(data=list_value_list,index=list_ids,columns=list_samples)
print(data)
ax=sns.clustermap(data,standard_scale=1,row_colors=list_gene_colors,col_colors=list_sample_colors)
plt.savefig(output_file)
plt.show()结果如下图所示:
Heatmap of diff-expressed V genes (there were no diff-expressed J genes)