Protein Panel Expansion

This notebook shows how to transform custom protein data into the correct shape for scLinguist.

0. Imports

[17]:
import sys
sys.path.append('../../')
import numpy as np
import pandas as pd
import anndata as anndata
import scanpy as sc
from pathlib import Path
from scipy.sparse import csr_matrix
from scLinguist.data_loaders.data_loader import expand_protein_to_panel

1. Parameters

[18]:
# Target panel definition: a CSV whose `name` column lists the panel's protein names.
PANEL_PATH = Path('../../docs/tutorials/protein_index_map.csv')
# Input protein measurements, stored as an AnnData .h5ad file.
PROTEIN_PATH = Path('../../data/test_sample_adt.h5ad')
# Directory that receives the expanded output; created below if it does not exist yet.
OUTPUT_DIR = Path('../../docs/tutorials/expanded_output')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# NOTE(review): the "6427" in the file name presumably mirrors the panel length —
# confirm and rename if a different panel is used.
OUT_H5AD = OUTPUT_DIR / 'pro_expanded_6427.h5ad'
# Compression passed to `write_h5ad` when the expanded AnnData is saved.
COMPRESSION = 'gzip'

2. Load data & target panel

[21]:
# Slicing an AnnData returns a view of the parent object. `.copy()` materializes it
# so that later steps can modify `.var` without anndata having to convert the view
# implicitly (which emits ImplicitModificationWarning).
pro = sc.read_h5ad(PROTEIN_PATH)[:, :10].copy()  # take 10 proteins as example

# Read the panel table, then extract its `name` column as a plain list of strings.
# The DataFrame and the list get separate names so re-running the cell is unambiguous.
panel_df = pd.read_csv(PANEL_PATH, index_col=None)
panel = panel_df['name'].tolist()

print('Protein AnnData:', pro.shape)
print('Panel length:', len(panel))
Protein AnnData: (10546, 10)
Panel length: 6427
[22]:
panel[:10]  # preview only the first few names; the full panel is too long to display usefully
[22]:
['SP110',
 'GTPBA',
 'SNX2',
 'FRG1',
 'TT21A',
 'RHG18',
 'AR',
 'DOCK1',
 'RAB1A',
 'MUC1.HMFG2',
 'H2B1L',
 'RFC1',
 'TXTP',
 'MER34',
 'IL.3',
 'FXR2',
 'ARMD3',
 'ZDHC9',
 'KAPCA',
 'HH3',
 'ZBT21',
 'F149B',
 'PDE10',
 'PCD18',
 'PEBB',
 'PRI1',
 'NU214',
 'TNAP',
 'JHD2C',
 'MIC19',
 'CD213A2',
 'HYEP',
 'ZFR',
 'PUF60',
 'GPAT3',
 'DIDO1',
 'CD185',
 'AT12A',
 'TATD1',
 'ZN341',
 'RPB3',
 'APOC3',
 'TCR.VY9',
 'HS90B',
 'EMAL6',
 'WFS1',
 'RS9',
 'PGK2',
 'TYW2',
 'EF1B',
 'IMB1',
 'ACADM',
 'PAR6A',
 'CAP1',
 'CD357',
 'RGS7',
 'TCTP',
 'EXOS9',
 'RT34',
 'SQSTM',
 'TBA1A',
 'ENOPH',
 'BAP18',
 'AQR',
 'SMRC1',
 'FRIH',
 'HS105',
 'MORN3',
 'TIM23',
 'K1143',
 'CXB3',
 'CD72',
 'UBP25',
 'ECP',
 'EF2',
 'NSF1C',
 'CD196',
 'SPDE3',
 'CCR10',
 'PON2',
 'PURB',
 'PLCG2',
 'TCPD',
 'CPNE3',
 'TCRG',
 'MAOX',
 'PEPL',
 'NUF2',
 'RYR2',
 'LIMC1',
 'CHM2B',
 'LNX1',
 'MARCS',
 'ATP5E',
 'ZBED6',
 'FA47C',
 'UBE4B',
 'AAMDC',
 'CGAS',
 'AL1L1',
 'PDE1A',
 'VIR',
 'AKP8L',
 'APC',
 'PLXD1',
 'RNZ2',
 'MYPC3',
 'MRP5',
 'PROF2',
 'UBA3',
 'ZN599',
 'TCR.VB13.2',
 'TTC1',
 'NOP58',
 'RL27A',
 'INVO',
 'IF16',
 'SNX15',
 'ARL8B',
 'UNC80',
 'RMD1',
 'CD141',
 'CS044',
 'MXRA5',
 'S10A6',
 'LORF2',
 'NDKB',
 'CD56.1',
 'EPB42',
 'DIRC2',
 'COPZ1',
 'ECH1',
 'SYNE1',
 'MBB1A',
 'GTF2I',
 'PTN1',
 'PGTB2',
 'TRPM6',
 'CORO7',
 'DJC13',
 'O14K1',
 'PLIN3',
 'LDHA',
 'PP2BA',
 'COPE',
 'DPOLA',
 'FHL5',
 'SNX9',
 'F200B',
 'CD10',
 'MPV17',
 'STAT3',
 'PAR3L',
 'INT13',
 'RPB11',
 'CTL2',
 'PACN1',
 'GLYG2',
 'KMT2B',
 'LXN',
 'RFA1',
 'DPOG1',
 'O10J4',
 'SYVC',
 'OCAD1',
 'FANCM',
 'RPC2',
 'SYLC',
 'TRAP1',
 'KIF25',
 'ABCA5',
 'CP26B',
 'CCNK',
 'ZNF85',
 'MDM4',
 'NIPBL',
 'HECW1',
 'P63',
 'ADPRM',
 'CD275',
 'FAAA',
 'TEBP',
 'RN103',
 'MYO16',
 'SAM15',
 'SCRB2',
 'NCF2',
 'NU160',
 'CD88',
 'KNTC1',
 'BTK',
 'PPIA',
 'CD300',
 'CSTN3',
 'CADH4',
 'CO4A1',
 'DGKQ',
 'THYN1',
 'ALDR',
 'BMX',
 'PRPS1',
 'KMO',
 'CD85J',
 'PPR3A',
 'ZO3',
 'BAZ1A',
 'K1C19',
 'F120C',
 'RNF12',
 'VILI',
 'GYS2',
 'DHRS9',
 'PDE3A',
 'ENOG',
 'PSMD2',
 'MCFD2',
 'VISTA',
 'DCC1',
 'TPPC8',
 'KPRA',
 'ATD3A',
 'BRCA2',
 'P5F1B',
 'CAND1',
 'AT11B',
 'IGBP1',
 'ARLY',
 'GBP7',
 'SYNEM',
 'ZAN',
 'NUP98',
 'PDS5A',
 'IF4A1',
 'BRD8',
 'MPRIP',
 'ANR62',
 'MINY3',
 'ATAD1',
 'KGD4',
 'CYTSB',
 'TCR.VBETA13.1',
 'RL12',
 'CUL1',
 'PI42B',
 'PLCB3',
 'LMTK1',
 'TUT4',
 'SPSY',
 'SSRG',
 'MYOF',
 'PAAT',
 'GCP2',
 'ECI2',
 'ZMYM1',
 'GAB3',
 'DHB11',
 'SGT1',
 'TAM41',
 'IGG4.IH',
 'SCFD1',
 'GUAA',
 'NFIL3',
 'CHM1A',
 'DUS10',
 'PUS1',
 'PPRC1',
 'VEGFA',
 'MSH5',
 'DQX1',
 'BAG3',
 'SAP',
 'F184B',
 'KRIT1',
 'FXL19',
 'NUP50',
 'HS71L',
 'MTUS2',
 'GPTC4',
 'PGC1A.P',
 'CO8B',
 'E2AK2',
 'IMDH1',
 'PTK7',
 'STK24',
 'ILK',
 'ABCA7',
 'PDPN',
 'TXN4A',
 'HMR1',
 'RL26',
 'DIRA3',
 'BUB3',
 'ACTS',
 'ZN626',
 'ARP19',
 'ILKAP',
 'PPAC',
 'MYOM1',
 'MRP1',
 'SC2B2',
 'CNN3',
 'PAI2',
 'THIOM',
 'VPS41',
 'CDK6',
 'STK31',
 'CAV1',
 'SCPDL',
 'CHIKV',
 'COCA1',
 'SMRD1',
 'PRG4',
 'WASP',
 'PCSK9',
 'IL.21',
 'UGGG1',
 'KI18A',
 'ENO1',
 'ZN292',
 'MAGT1',
 'CD282',
 'ASPDH',
 'ARSG',
 'RT23',
 'CL079',
 'CX3CR1',
 'NDUBB',
 'FBP1L',
 'ECI1',
 'CLK1',
 'CRNL1',
 'ACSL3',
 'AFAD',
 'RB11A',
 'STRAB',
 'CC90B',
 '1433G',
 'CADHERIN',
 'RRAS2',
 'RTL1',
 'GADL1',
 'DSG4',
 'GRSF1',
 'TCRB',
 'COA3',
 'PRC2B',
 'NXPE2',
 'ATPD',
 'ECM29',
 'TDH',
 'CL004',
 'AJM1',
 'UQCC1',
 'LYRM2',
 'CDN2A',
 'SAHH',
 'RBM4',
 'RL18',
 'OVOL2',
 'TE2IP',
 'GYS1',
 'PPM1D',
 'ANTR2',
 'RHG31',
 'PSMD4',
 'CD85A',
 'LIN41',
 'RL7L',
 'CRK',
 'DHX16',
 'TBC31',
 'DAPK1',
 'EVPL',
 'CPNE1',
 'LAMA3',
 'LRRN1',
 'RL18A',
 'CAN14',
 'EFL1',
 'RL19',
 'RL22',
 'THTR',
 'ALG1',
 'CS',
 'ARP5L',
 'EMAL4',
 'TS1R1',
 'TNPO1',
 'COR1B',
 'MYLK',
 'ALG3',
 'CD35',
 'NRBP',
 'COPA',
 'CD229',
 'DCTP1',
 'RYBP',
 'TX1B3',
 'TRI60',
 'WASF3',
 'STIL',
 'IKZF1',
 'BCCIP',
 'ZMY15',
 'SNED1',
 'ALPK3',
 'CPNE6',
 'GCC2',
 'DDX6',
 'FLOT2',
 'LONP2',
 'CELR3',
 'DPP8',
 'LAP',
 'VPS4B',
 'SH3G1',
 'DNAS1',
 'MTND',
 'CD8',
 'CISD2',
 'CD90',
 'COX5A',
 'EIF2A',
 'PSMD1',
 'ADCK1',
 'YK004',
 'ATX2L',
 'DLDH',
 'NDST3',
 'CENPC',
 'STK19',
 'PARVB',
 'DDX49',
 'F157A',
 'OVGP1',
 'SYUA',
 'LMTK3',
 'CD85H',
 'TCPQ',
 'NOL11',
 'R10B1',
 'FAKD4',
 'APLP2',
 'NUCB1',
 'SEM3C',
 'GLYG',
 'TOM20',
 'CD257',
 'PABP5',
 'RM53',
 'HECD1',
 'CD133',
 'PPIL3',
 'KCC4',
 'AUTS2',
 'PCH2',
 'CD62L',
 'SNRK',
 'NPM3',
 'CD210',
 'CD11A',
 'RT24',
 'ARVC',
 'MBNL2',
 'PAQR1',
 'HCK',
 'ELMO3',
 'PGAP1',
 'RS18',
 'CD252',
 'CHFR',
 'NELL1',
 'PEX6',
 'TTLL8',
 'RGS8',
 'TCP4',
 'KEAP1',
 'KIN17',
 'SAE1',
 'GARL3',
 'PCD19',
 'ZNF81',
 'NFAT',
 'CLEC1B',
 'IL17F',
 'SPT5H',
 'CD7',
 'SMAP',
 'ADT4',
 'TTC17',
 'VATE1',
 'CD123',
 'ZN641',
 'AGRV1',
 'NH2L1',
 'VWC2L',
 'UHRF2',
 'HDC',
 'CD44',
 'TSN11',
 'UAP1',
 'NRDC',
 'FKBP4',
 'CFA54',
 'GDE',
 'TCR.GAMMA',
 'ARMT1',
 'HEAT3',
 'AT5EL',
 'SYBU',
 'CA2D4',
 'CD107B',
 'RS5',
 'CD178',
 'PP1G',
 'GCNA',
 'S1PROBE',
 'ZFAT',
 'PK3CA',
 'ATX10',
 'MED24',
 'ARP3',
 'PHF5A',
 'VAMP5',
 'SPICE',
 'PPR3D',
 'RXRG',
 'PSB5',
 'CIR1',
 'CNGB3',
 'TBA4A',
 'NID1',
 'CS2IP',
 'NEXN',
 'EST1',
 'H2A1D',
 'RAGP1',
 'IST1',
 'THOC4',
 'GALECTIN9',
 'GMPR1',
 'STRC',
 'TCPA',
 'SLIK3',
 'AKP13',
 'SET1B',
 'HEBP1',
 'SCAM2',
 'ESYT2',
 'SNR40',
 'FKBP9',
 'IFNA',
 'RT10',
 'CBPD',
 'TMA7',
 'HLA.E',
 'RTN3',
 'SG3A2',
 'INTEGRIN.A9B1',
 'CRML',
 'PRP16',
 'AP1M2',
 'PCX1',
 'SLIK4',
 'CD94',
 'CD276',
 'AK1C3',
 'EMAL2',
 'PEX5',
 'VINEX',
 'MEX3D',
 'SQOR',
 'RBM10',
 'FAT2',
 'PP2AB',
 'TBB2B',
 'RPR1B',
 'ZC3H3',
 'ANXA1',
 'SLIK1',
 'RS16',
 'CD16B',
 'FXR1',
 'TP53B',
 'BTNLA',
 'RPB7',
 'RALY',
 'PSB4',
 'CP250',
 'EIF3J',
 'CD365',
 'MTPN',
 'AT11A',
 'DCAF5',
 'ZN840',
 'TM52B',
 'ARFG2',
 'ZN596',
 'SHLB2',
 'PSMF1',
 'HNRPQ',
 'SYCC',
 'IDUA',
 'SYQ',
 'UN13D',
 'CCD70',
 'PRRT4',
 'CC124',
 'DREB',
 'PTH2R',
 'SAM14',
 'F162A',
 'ZN425',
 'PDE4A',
 'TOM22',
 'DX39B',
 'NUDT5',
 'CABP7',
 'S29P2',
 'CSN3',
 'NOL10',
 'MTMR6',
 'RLA1',
 'APC7',
 'PFD6',
 'NBEA',
 'NCOA5',
 'ACADS',
 'BRCA1',
 'HARB1',
 'CD019',
 'MATR3',
 'IQGA1',
 'PRS8',
 'CLD10',
 'CX7A2',
 'NUDC2',
 'P3C2B',
 'ARPC5',
 'GPCP1',
 'AUHM',
 'TRRAP',
 'TCR.Y.D',
 'S6',
 'LTOR5',
 'KRR1',
 'TCPR1',
 'WBP1',
 'FGD6',
 'BTF3',
 'HPLN1',
 'PFD4',
 'ZC11A',
 'WDR82',
 'CD215',
 'RLA0',
 'TBX2',
 'DDX56',
 'INSL3',
 'ZNT3',
 'SERP3',
 'METK2',
 'CTBL1',
 'POF1B',
 'GRAP2',
 'PLAK',
 'PESC',
 'HMMR',
 'CB078',
 'CRBG3',
 'OSGEP',
 'CD108',
 'H1X',
 'CD352',
 'RPGF5',
 '5MP2',
 'TFAP4',
 'ZN519',
 'CD45RB',
 'ZN608',
 'TENS1',
 'EIF3E',
 'IFNA2',
 'PRA33',
 'CF20D',
 'ATP5J',
 'LRRF1',
 'UBX2A',
 'UQCC2',
 'CA159',
 'MUCL3',
 'AKTS1',
 'RAB8B',
 'PLPL2',
 'VSXL2',
 'ABCAC',
 'DYST',
 'RNFT2',
 'UTP4',
 'HXC10',
 'CFA74',
 'CPIN1',
 'RASN',
 'FBSL',
 'PRP8',
 'CD182',
 'DNJC2',
 'TCR.VA7',
 'SH3B4',
 'RS8',
 'NDKA',
 'ESPNL',
 'MEG10',
 'SNX29',
 'RPB9',
 'UT14A',
 'UBF1',
 'FKBP5',
 'EPAS1',
 'SYSC',
 'AOL',
 'ZN471',
 'PLOD2',
 'KHDR1',
 'HERC3',
 'CPT2',
 'TM1L1',
 'KIRS.PE',
 'MMP25',
 'TM104',
 'CD4',
 'DDX41',
 'MED17',
 'ACL6A',
 'CD165',
 'DHB13',
 'ZCH18',
 'MAGB1',
 'PR40B',
 'CCDC6',
 'UBP44',
 'PLD2',
 'SCG2',
 'ETAA1',
 'CDV3',
 'RBGPR',
 'CD5',
 'PINX1',
 'TSN13',
 'ARRS',
 'VN1R5',
 'M4K4',
 'DENR',
 'RPN1',
 'OR6K6',
 'PAIP1',
 'UBP48',
 'ERP44',
 'CCG6',
 'UGDH',
 'GL1AD',
 'KINH',
 'TGO1',
 'CYB5B',
 'RGS5',
 'ENSA',
 'H90B4',
 'MYH6',
 'UROK',
 'RRP5',
 'KANL1',
 'CHSP1',
 'MYO3B',
 'DBNL',
 'KALM',
 'MAGB3',
 'SPTB1',
 'API5',
 'CD207',
 'CPPED',
 'TBCE',
 'CLC2B',
 'CX6B1',
 'FBRL',
 'PSME1',
 'AGRB1',
 'OR1L1',
 'HYDIN',
 'TRPC1',
 'EFNA1',
 'CEP55',
 'RTCA',
 'BCL9L',
 'ARP2',
 'ODP2',
 'COX41',
 'CH086',
 'GCDH',
 'ZN582',
 'TCF7',
 'CCD89',
 'PRTN3',
 'CCD86',
 'PBDC1',
 'BLM',
 'RSLBB',
 'CFAI',
 'ULK1',
 'IGSF1',
 'SMC1A',
 'SHOC2',
 'DJC11',
 'POP7',
 'HSDL2',
 'SUV3',
 'UGPA',
 'PEBP1',
 'SUMO2',
 'RM17',
 'ACLY',
 'CTND2',
 'ICAM5',
 'NONO',
 'LARP4',
 'EWS',
 'MED1',
 'MCR',
 'TCR.1',
 'DNJC4',
 'SRRM2',
 'CD381',
 'DDX21',
 'CD274',
 'RT09',
 'GOLI4',
 'ZC12C',
 'NPSR1',
 'TLK2',
 'ZNF41',
 'PNPH',
 'PLPP7',
 'GDIR2',
 'LIMA1',
 'HAOX2',
 'GBRA1',
 'ADAS',
 'CNN1',
 'CPSF2',
 'ATLA3',
 'ERIC1',
 'PYGL',
 'PELO',
 'SMC2',
 'CD3',
 'TF65',
 'AR13B',
 'ARF4',
 'SYNP2',
 'ROBO1',
 'PGRP4',
 'CENPV',
 'AKAP1',
 'C1S',
 'NACA2',
 'SYIM',
 'SAFB2',
 'NOP53',
 'XRCC6',
 'ABLM3',
 'TEKT1',
 'FBW1B',
 'NTPCR',
 'NKD1',
 'DDI2',
 'ELAV1',
 'CALM1',
 'OR6Y1',
 'BRI3B',
 'OGT1',
 'KAD3',
 'SNX6',
 'XBP1',
 'CBPC4',
 'PPP5',
 'PLSL',
 'ICAL',
 'TCR.VA24.JA18',
 'TDIF2',
 'PCY1A',
 'ARHGI',
 'BBS2',
 'CD49A',
 'UBP42',
 'UBE3A',
 'TBB6',
 'PP2AA',
 'ALDH2',
 'DEN4C',
 'A4',
 'UBP31',
 'RTEL1',
 'TTC22',
 'PXDC1',
 'VIP1',
 'AGRA3',
 'UBP11',
 'ZBED4',
 'PRDX6',
 'IF172',
 'ORC5',
 'SF3A2',
 'NNMT',
 'GLR',
 'PCKGM',
 'NRG2',
 'STAU1',
 'BD1L1',
 'CARL1',
 'MUC21',
 'TRIPB',
 'CD054',
 'ALKB5',
 'CY24A',
 'KCY',
 'M3K6',
 'SYNC',
 'PCDH8',
 'SCAF4',
 'LELP1',
 'NHP2',
 'JUPI2',
 'TRUB1',
 'RU2A',
 'CFA58',
 'CERS2',
 'RAP2B',
 'PYAS1',
 'SRSF9',
 'SOX9',
 'CD8B.2ST8.5H7',
 'EFC11',
 'TIAM1',
 'ADCY1',
 'CPT1A',
 'ACOT9',
 'FERM1',
 'PSMD5',
 'AATC',
 'EMC2',
 'GNAS2',
 'ERB',
 'ARHGB',
 'SYYM',
 'C1QBP',
 'FANK1',
 'PSB1',
 'GDN',
 'STT3A',
 'LGSN',
 'CD97',
 'PSMG1',
 'CC178',
 'PUR4',
 'CD158E1',
 'MF2L2',
 'PA24C',
 'SCML1',
 'TRPV5',
 'ATRX',
 'XRN2',
 'CTNA2',
 'DNJC9',
 'NEUL',
 'HNRL1',
 'SMCA2',
 'KRT85',
 'CAVN2',
 'MUC5A',
 'DRC2',
 'KCNH8',
 'TDRD5',
 'UTP11',
 'VP13C',
 'TM109',
 'MAT2B',
 'NU188',
 'CNPY4',
 'LAT2',
 'PSMD9',
 'CD324',
 'SPAG7',
 'BUD31',
 'ATG3',
 'ARFP2',
 'COPG2',
 'PEAK3',
 'LARG1',
 'CILK1',
 'WDR44',
 'CPSM',
 ...]

3. Expand and save

[23]:
# Expand the source protein matrix onto the target panel.
pro_expanded = expand_protein_to_panel(pro, panel, id_col=None)
print('Expanded shape:', pro_expanded.shape)

# Sanity-check the result before persisting it: the expansion must keep every
# observation and produce exactly one variable per panel entry.
assert pro_expanded.n_obs == pro.n_obs, 'expansion changed the number of observations'
assert pro_expanded.n_vars == len(panel), 'expanded width does not match the panel length'

pro_expanded.write_h5ad(OUT_H5AD, compression=COMPRESSION)
print('Saved to:', OUT_H5AD)
Expanded shape: (10546, 6427)
Saved to: ../../docs/tutorials/expanded_output/pro_expanded_6427.h5ad
../../scLinguist/data_loaders/data_loader.py:894: ImplicitModificationWarning: Trying to modify attribute `.var` of view, initializing view as actual.
  pro.var["feature_id"] = pro.var_names

4. Quick check

[24]:
print('First 10 names:', list(pro_expanded.var_names[:10]))
print('Total vars:', pro_expanded.n_vars)

# Build the lookup set once. Constructing it inside the comprehension would
# rebuild it for every panel entry.
source_names = set(pro.var_names)
missing = [p for p in panel if p not in source_names]
print('Missing count from source (filled with zeros):', len(missing))
First 10 names: ['SP110', 'GTPBA', 'SNX2', 'FRG1', 'TT21A', 'RHG18', 'AR', 'DOCK1', 'RAB1A', 'MUC1.HMFG2']
Total vars: 6427
Missing count from source (filled with zeros): 6417