Protein Panel Expansion
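This notebook shows how to transform custom protein data into the shape expected by scLinguist: the measured proteins are expanded onto the full target protein panel, and panel proteins that are missing from the source data are filled with zeros.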
0. Imports
[17]:
import sys
sys.path.append('../../')
import numpy as np
import pandas as pd
import anndata as anndata
import scanpy as sc
from pathlib import Path
from scipy.sparse import csr_matrix
from scLinguist.data_loaders.data_loader import expand_protein_to_panel
1. Parameters
[18]:
# --- Output settings ---------------------------------------------------------
COMPRESSION = 'gzip'  # compression argument used when the expanded AnnData is written
OUTPUT_DIR = Path('../../docs/tutorials/expanded_output')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # create the folder up front; no error if it already exists
OUT_H5AD = OUTPUT_DIR.joinpath('pro_expanded_6427.h5ad')

# --- Input files (relative paths) --------------------------------------------
PROTEIN_PATH = Path('../../data/test_sample_adt.h5ad')  # source protein AnnData
PANEL_PATH = Path('../../docs/tutorials/protein_index_map.csv')  # CSV listing the target panel
2. Load data & target panel
[21]:
# Load the source protein AnnData and keep the first 10 proteins as a small example.
# Slicing an AnnData yields a view; .copy() materializes it so that later code
# which writes to `.var` does not raise ImplicitModificationWarning.
pro = sc.read_h5ad(PROTEIN_PATH)[:, :10].copy()

# The target panel is the `name` column of the index-map CSV, in row order.
# Bracket access is used so the column can never be shadowed by a DataFrame
# attribute, and the DataFrame keeps its own name instead of being overwritten
# by the list.
panel_df = pd.read_csv(PANEL_PATH, index_col=None)
panel = panel_df['name'].tolist()
print('Protein AnnData:', pro.shape)
print('Panel length:', len(panel))
Protein AnnData: (10546, 10)
Panel length: 6427
[22]:
# Preview only the first few panel names; displaying the whole list floods the
# notebook output with thousands of lines.
panel[:10]
[22]:
['SP110',
'GTPBA',
'SNX2',
'FRG1',
'TT21A',
'RHG18',
'AR',
'DOCK1',
'RAB1A',
'MUC1.HMFG2',
'H2B1L',
'RFC1',
'TXTP',
'MER34',
'IL.3',
'FXR2',
'ARMD3',
'ZDHC9',
'KAPCA',
'HH3',
'ZBT21',
'F149B',
'PDE10',
'PCD18',
'PEBB',
'PRI1',
'NU214',
'TNAP',
'JHD2C',
'MIC19',
'CD213A2',
'HYEP',
'ZFR',
'PUF60',
'GPAT3',
'DIDO1',
'CD185',
'AT12A',
'TATD1',
'ZN341',
'RPB3',
'APOC3',
'TCR.VY9',
'HS90B',
'EMAL6',
'WFS1',
'RS9',
'PGK2',
'TYW2',
'EF1B',
'IMB1',
'ACADM',
'PAR6A',
'CAP1',
'CD357',
'RGS7',
'TCTP',
'EXOS9',
'RT34',
'SQSTM',
'TBA1A',
'ENOPH',
'BAP18',
'AQR',
'SMRC1',
'FRIH',
'HS105',
'MORN3',
'TIM23',
'K1143',
'CXB3',
'CD72',
'UBP25',
'ECP',
'EF2',
'NSF1C',
'CD196',
'SPDE3',
'CCR10',
'PON2',
'PURB',
'PLCG2',
'TCPD',
'CPNE3',
'TCRG',
'MAOX',
'PEPL',
'NUF2',
'RYR2',
'LIMC1',
'CHM2B',
'LNX1',
'MARCS',
'ATP5E',
'ZBED6',
'FA47C',
'UBE4B',
'AAMDC',
'CGAS',
'AL1L1',
'PDE1A',
'VIR',
'AKP8L',
'APC',
'PLXD1',
'RNZ2',
'MYPC3',
'MRP5',
'PROF2',
'UBA3',
'ZN599',
'TCR.VB13.2',
'TTC1',
'NOP58',
'RL27A',
'INVO',
'IF16',
'SNX15',
'ARL8B',
'UNC80',
'RMD1',
'CD141',
'CS044',
'MXRA5',
'S10A6',
'LORF2',
'NDKB',
'CD56.1',
'EPB42',
'DIRC2',
'COPZ1',
'ECH1',
'SYNE1',
'MBB1A',
'GTF2I',
'PTN1',
'PGTB2',
'TRPM6',
'CORO7',
'DJC13',
'O14K1',
'PLIN3',
'LDHA',
'PP2BA',
'COPE',
'DPOLA',
'FHL5',
'SNX9',
'F200B',
'CD10',
'MPV17',
'STAT3',
'PAR3L',
'INT13',
'RPB11',
'CTL2',
'PACN1',
'GLYG2',
'KMT2B',
'LXN',
'RFA1',
'DPOG1',
'O10J4',
'SYVC',
'OCAD1',
'FANCM',
'RPC2',
'SYLC',
'TRAP1',
'KIF25',
'ABCA5',
'CP26B',
'CCNK',
'ZNF85',
'MDM4',
'NIPBL',
'HECW1',
'P63',
'ADPRM',
'CD275',
'FAAA',
'TEBP',
'RN103',
'MYO16',
'SAM15',
'SCRB2',
'NCF2',
'NU160',
'CD88',
'KNTC1',
'BTK',
'PPIA',
'CD300',
'CSTN3',
'CADH4',
'CO4A1',
'DGKQ',
'THYN1',
'ALDR',
'BMX',
'PRPS1',
'KMO',
'CD85J',
'PPR3A',
'ZO3',
'BAZ1A',
'K1C19',
'F120C',
'RNF12',
'VILI',
'GYS2',
'DHRS9',
'PDE3A',
'ENOG',
'PSMD2',
'MCFD2',
'VISTA',
'DCC1',
'TPPC8',
'KPRA',
'ATD3A',
'BRCA2',
'P5F1B',
'CAND1',
'AT11B',
'IGBP1',
'ARLY',
'GBP7',
'SYNEM',
'ZAN',
'NUP98',
'PDS5A',
'IF4A1',
'BRD8',
'MPRIP',
'ANR62',
'MINY3',
'ATAD1',
'KGD4',
'CYTSB',
'TCR.VBETA13.1',
'RL12',
'CUL1',
'PI42B',
'PLCB3',
'LMTK1',
'TUT4',
'SPSY',
'SSRG',
'MYOF',
'PAAT',
'GCP2',
'ECI2',
'ZMYM1',
'GAB3',
'DHB11',
'SGT1',
'TAM41',
'IGG4.IH',
'SCFD1',
'GUAA',
'NFIL3',
'CHM1A',
'DUS10',
'PUS1',
'PPRC1',
'VEGFA',
'MSH5',
'DQX1',
'BAG3',
'SAP',
'F184B',
'KRIT1',
'FXL19',
'NUP50',
'HS71L',
'MTUS2',
'GPTC4',
'PGC1A.P',
'CO8B',
'E2AK2',
'IMDH1',
'PTK7',
'STK24',
'ILK',
'ABCA7',
'PDPN',
'TXN4A',
'HMR1',
'RL26',
'DIRA3',
'BUB3',
'ACTS',
'ZN626',
'ARP19',
'ILKAP',
'PPAC',
'MYOM1',
'MRP1',
'SC2B2',
'CNN3',
'PAI2',
'THIOM',
'VPS41',
'CDK6',
'STK31',
'CAV1',
'SCPDL',
'CHIKV',
'COCA1',
'SMRD1',
'PRG4',
'WASP',
'PCSK9',
'IL.21',
'UGGG1',
'KI18A',
'ENO1',
'ZN292',
'MAGT1',
'CD282',
'ASPDH',
'ARSG',
'RT23',
'CL079',
'CX3CR1',
'NDUBB',
'FBP1L',
'ECI1',
'CLK1',
'CRNL1',
'ACSL3',
'AFAD',
'RB11A',
'STRAB',
'CC90B',
'1433G',
'CADHERIN',
'RRAS2',
'RTL1',
'GADL1',
'DSG4',
'GRSF1',
'TCRB',
'COA3',
'PRC2B',
'NXPE2',
'ATPD',
'ECM29',
'TDH',
'CL004',
'AJM1',
'UQCC1',
'LYRM2',
'CDN2A',
'SAHH',
'RBM4',
'RL18',
'OVOL2',
'TE2IP',
'GYS1',
'PPM1D',
'ANTR2',
'RHG31',
'PSMD4',
'CD85A',
'LIN41',
'RL7L',
'CRK',
'DHX16',
'TBC31',
'DAPK1',
'EVPL',
'CPNE1',
'LAMA3',
'LRRN1',
'RL18A',
'CAN14',
'EFL1',
'RL19',
'RL22',
'THTR',
'ALG1',
'CS',
'ARP5L',
'EMAL4',
'TS1R1',
'TNPO1',
'COR1B',
'MYLK',
'ALG3',
'CD35',
'NRBP',
'COPA',
'CD229',
'DCTP1',
'RYBP',
'TX1B3',
'TRI60',
'WASF3',
'STIL',
'IKZF1',
'BCCIP',
'ZMY15',
'SNED1',
'ALPK3',
'CPNE6',
'GCC2',
'DDX6',
'FLOT2',
'LONP2',
'CELR3',
'DPP8',
'LAP',
'VPS4B',
'SH3G1',
'DNAS1',
'MTND',
'CD8',
'CISD2',
'CD90',
'COX5A',
'EIF2A',
'PSMD1',
'ADCK1',
'YK004',
'ATX2L',
'DLDH',
'NDST3',
'CENPC',
'STK19',
'PARVB',
'DDX49',
'F157A',
'OVGP1',
'SYUA',
'LMTK3',
'CD85H',
'TCPQ',
'NOL11',
'R10B1',
'FAKD4',
'APLP2',
'NUCB1',
'SEM3C',
'GLYG',
'TOM20',
'CD257',
'PABP5',
'RM53',
'HECD1',
'CD133',
'PPIL3',
'KCC4',
'AUTS2',
'PCH2',
'CD62L',
'SNRK',
'NPM3',
'CD210',
'CD11A',
'RT24',
'ARVC',
'MBNL2',
'PAQR1',
'HCK',
'ELMO3',
'PGAP1',
'RS18',
'CD252',
'CHFR',
'NELL1',
'PEX6',
'TTLL8',
'RGS8',
'TCP4',
'KEAP1',
'KIN17',
'SAE1',
'GARL3',
'PCD19',
'ZNF81',
'NFAT',
'CLEC1B',
'IL17F',
'SPT5H',
'CD7',
'SMAP',
'ADT4',
'TTC17',
'VATE1',
'CD123',
'ZN641',
'AGRV1',
'NH2L1',
'VWC2L',
'UHRF2',
'HDC',
'CD44',
'TSN11',
'UAP1',
'NRDC',
'FKBP4',
'CFA54',
'GDE',
'TCR.GAMMA',
'ARMT1',
'HEAT3',
'AT5EL',
'SYBU',
'CA2D4',
'CD107B',
'RS5',
'CD178',
'PP1G',
'GCNA',
'S1PROBE',
'ZFAT',
'PK3CA',
'ATX10',
'MED24',
'ARP3',
'PHF5A',
'VAMP5',
'SPICE',
'PPR3D',
'RXRG',
'PSB5',
'CIR1',
'CNGB3',
'TBA4A',
'NID1',
'CS2IP',
'NEXN',
'EST1',
'H2A1D',
'RAGP1',
'IST1',
'THOC4',
'GALECTIN9',
'GMPR1',
'STRC',
'TCPA',
'SLIK3',
'AKP13',
'SET1B',
'HEBP1',
'SCAM2',
'ESYT2',
'SNR40',
'FKBP9',
'IFNA',
'RT10',
'CBPD',
'TMA7',
'HLA.E',
'RTN3',
'SG3A2',
'INTEGRIN.A9B1',
'CRML',
'PRP16',
'AP1M2',
'PCX1',
'SLIK4',
'CD94',
'CD276',
'AK1C3',
'EMAL2',
'PEX5',
'VINEX',
'MEX3D',
'SQOR',
'RBM10',
'FAT2',
'PP2AB',
'TBB2B',
'RPR1B',
'ZC3H3',
'ANXA1',
'SLIK1',
'RS16',
'CD16B',
'FXR1',
'TP53B',
'BTNLA',
'RPB7',
'RALY',
'PSB4',
'CP250',
'EIF3J',
'CD365',
'MTPN',
'AT11A',
'DCAF5',
'ZN840',
'TM52B',
'ARFG2',
'ZN596',
'SHLB2',
'PSMF1',
'HNRPQ',
'SYCC',
'IDUA',
'SYQ',
'UN13D',
'CCD70',
'PRRT4',
'CC124',
'DREB',
'PTH2R',
'SAM14',
'F162A',
'ZN425',
'PDE4A',
'TOM22',
'DX39B',
'NUDT5',
'CABP7',
'S29P2',
'CSN3',
'NOL10',
'MTMR6',
'RLA1',
'APC7',
'PFD6',
'NBEA',
'NCOA5',
'ACADS',
'BRCA1',
'HARB1',
'CD019',
'MATR3',
'IQGA1',
'PRS8',
'CLD10',
'CX7A2',
'NUDC2',
'P3C2B',
'ARPC5',
'GPCP1',
'AUHM',
'TRRAP',
'TCR.Y.D',
'S6',
'LTOR5',
'KRR1',
'TCPR1',
'WBP1',
'FGD6',
'BTF3',
'HPLN1',
'PFD4',
'ZC11A',
'WDR82',
'CD215',
'RLA0',
'TBX2',
'DDX56',
'INSL3',
'ZNT3',
'SERP3',
'METK2',
'CTBL1',
'POF1B',
'GRAP2',
'PLAK',
'PESC',
'HMMR',
'CB078',
'CRBG3',
'OSGEP',
'CD108',
'H1X',
'CD352',
'RPGF5',
'5MP2',
'TFAP4',
'ZN519',
'CD45RB',
'ZN608',
'TENS1',
'EIF3E',
'IFNA2',
'PRA33',
'CF20D',
'ATP5J',
'LRRF1',
'UBX2A',
'UQCC2',
'CA159',
'MUCL3',
'AKTS1',
'RAB8B',
'PLPL2',
'VSXL2',
'ABCAC',
'DYST',
'RNFT2',
'UTP4',
'HXC10',
'CFA74',
'CPIN1',
'RASN',
'FBSL',
'PRP8',
'CD182',
'DNJC2',
'TCR.VA7',
'SH3B4',
'RS8',
'NDKA',
'ESPNL',
'MEG10',
'SNX29',
'RPB9',
'UT14A',
'UBF1',
'FKBP5',
'EPAS1',
'SYSC',
'AOL',
'ZN471',
'PLOD2',
'KHDR1',
'HERC3',
'CPT2',
'TM1L1',
'KIRS.PE',
'MMP25',
'TM104',
'CD4',
'DDX41',
'MED17',
'ACL6A',
'CD165',
'DHB13',
'ZCH18',
'MAGB1',
'PR40B',
'CCDC6',
'UBP44',
'PLD2',
'SCG2',
'ETAA1',
'CDV3',
'RBGPR',
'CD5',
'PINX1',
'TSN13',
'ARRS',
'VN1R5',
'M4K4',
'DENR',
'RPN1',
'OR6K6',
'PAIP1',
'UBP48',
'ERP44',
'CCG6',
'UGDH',
'GL1AD',
'KINH',
'TGO1',
'CYB5B',
'RGS5',
'ENSA',
'H90B4',
'MYH6',
'UROK',
'RRP5',
'KANL1',
'CHSP1',
'MYO3B',
'DBNL',
'KALM',
'MAGB3',
'SPTB1',
'API5',
'CD207',
'CPPED',
'TBCE',
'CLC2B',
'CX6B1',
'FBRL',
'PSME1',
'AGRB1',
'OR1L1',
'HYDIN',
'TRPC1',
'EFNA1',
'CEP55',
'RTCA',
'BCL9L',
'ARP2',
'ODP2',
'COX41',
'CH086',
'GCDH',
'ZN582',
'TCF7',
'CCD89',
'PRTN3',
'CCD86',
'PBDC1',
'BLM',
'RSLBB',
'CFAI',
'ULK1',
'IGSF1',
'SMC1A',
'SHOC2',
'DJC11',
'POP7',
'HSDL2',
'SUV3',
'UGPA',
'PEBP1',
'SUMO2',
'RM17',
'ACLY',
'CTND2',
'ICAM5',
'NONO',
'LARP4',
'EWS',
'MED1',
'MCR',
'TCR.1',
'DNJC4',
'SRRM2',
'CD381',
'DDX21',
'CD274',
'RT09',
'GOLI4',
'ZC12C',
'NPSR1',
'TLK2',
'ZNF41',
'PNPH',
'PLPP7',
'GDIR2',
'LIMA1',
'HAOX2',
'GBRA1',
'ADAS',
'CNN1',
'CPSF2',
'ATLA3',
'ERIC1',
'PYGL',
'PELO',
'SMC2',
'CD3',
'TF65',
'AR13B',
'ARF4',
'SYNP2',
'ROBO1',
'PGRP4',
'CENPV',
'AKAP1',
'C1S',
'NACA2',
'SYIM',
'SAFB2',
'NOP53',
'XRCC6',
'ABLM3',
'TEKT1',
'FBW1B',
'NTPCR',
'NKD1',
'DDI2',
'ELAV1',
'CALM1',
'OR6Y1',
'BRI3B',
'OGT1',
'KAD3',
'SNX6',
'XBP1',
'CBPC4',
'PPP5',
'PLSL',
'ICAL',
'TCR.VA24.JA18',
'TDIF2',
'PCY1A',
'ARHGI',
'BBS2',
'CD49A',
'UBP42',
'UBE3A',
'TBB6',
'PP2AA',
'ALDH2',
'DEN4C',
'A4',
'UBP31',
'RTEL1',
'TTC22',
'PXDC1',
'VIP1',
'AGRA3',
'UBP11',
'ZBED4',
'PRDX6',
'IF172',
'ORC5',
'SF3A2',
'NNMT',
'GLR',
'PCKGM',
'NRG2',
'STAU1',
'BD1L1',
'CARL1',
'MUC21',
'TRIPB',
'CD054',
'ALKB5',
'CY24A',
'KCY',
'M3K6',
'SYNC',
'PCDH8',
'SCAF4',
'LELP1',
'NHP2',
'JUPI2',
'TRUB1',
'RU2A',
'CFA58',
'CERS2',
'RAP2B',
'PYAS1',
'SRSF9',
'SOX9',
'CD8B.2ST8.5H7',
'EFC11',
'TIAM1',
'ADCY1',
'CPT1A',
'ACOT9',
'FERM1',
'PSMD5',
'AATC',
'EMC2',
'GNAS2',
'ERB',
'ARHGB',
'SYYM',
'C1QBP',
'FANK1',
'PSB1',
'GDN',
'STT3A',
'LGSN',
'CD97',
'PSMG1',
'CC178',
'PUR4',
'CD158E1',
'MF2L2',
'PA24C',
'SCML1',
'TRPV5',
'ATRX',
'XRN2',
'CTNA2',
'DNJC9',
'NEUL',
'HNRL1',
'SMCA2',
'KRT85',
'CAVN2',
'MUC5A',
'DRC2',
'KCNH8',
'TDRD5',
'UTP11',
'VP13C',
'TM109',
'MAT2B',
'NU188',
'CNPY4',
'LAT2',
'PSMD9',
'CD324',
'SPAG7',
'BUD31',
'ATG3',
'ARFP2',
'COPG2',
'PEAK3',
'LARG1',
'CILK1',
'WDR44',
'CPSM',
...]
3. Expand and save
[23]:
pro_expanded = expand_protein_to_panel(pro, panel, id_col=None)
print('Expanded shape:', pro_expanded.shape)
pro_expanded.write_h5ad(OUT_H5AD, compression=COMPRESSION)
print('Saved to:', OUT_H5AD)
Expanded shape: (10546, 6427)
Saved to: ../../docs/tutorials/expanded_output/pro_expanded_6427.h5ad
../../scLinguist/data_loaders/data_loader.py:894: ImplicitModificationWarning: Trying to modify attribute `.var` of view, initializing view as actual.
pro.var["feature_id"] = pro.var_names
4. Quick check
[24]:
print('First 10 names:', list(pro_expanded.var_names[:10]))
print('Total vars:', pro_expanded.n_vars)
missing = [p for p in panel if p not in set(pro.var_names)]
print('Missing count from source (filled with zeros):', len(missing))
First 10 names: ['SP110', 'GTPBA', 'SNX2', 'FRG1', 'TT21A', 'RHG18', 'AR', 'DOCK1', 'RAB1A', 'MUC1.HMFG2']
Total vars: 6427
Missing count from source (filled with zeros): 6417