{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Imputation Tutorial"
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "This tutorial demonstrates how to use the **fine-tuned** scLinguist model to impute protein expression from RNA data."
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "Import the necessary packages and define the paths for the checkpoints and the save directory."
  },
  {
   "cell_type": "code",
   "source": [
    "import importlib\n",
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "from torch.utils.data import DataLoader\n",
    "\n",
    "# Make the directory two levels up (where the scLinguist package is expected to live)\n",
    "# importable; the guard keeps sys.path free of duplicates when this cell is re-run.\n",
    "if '../../' not in sys.path:\n",
    "    sys.path.append('../../')\n",
    "from scLinguist.data_loaders.data_loader import scMultiDataset\n",
    "from scLinguist.model.configuration_hyena import HyenaConfig\n",
    "from scLinguist.model.model import scTrans\n",
    "\n",
    "# Expose `scLinguist.model` under the top-level module name `model` as well.\n",
    "# NOTE(review): presumably the checkpoints reference that module path when they are\n",
    "# unpickled -- confirm before removing this alias.\n",
    "sys.modules['model'] = importlib.import_module('scLinguist.model')\n",
    "\n",
    "ENCODER_CKPT = Path(\"../../pretrained_model/encoder.ckpt\")\n",
    "DECODER_CKPT = Path(\"../../pretrained_model/decoder.ckpt\")\n",
    "FINETUNE_CKPT = Path(\"../../pretrained_model/finetune.ckpt\")\n",
    "SAVE_DIR = Path(\"../../docs/tutorials/finetune_output\")\n",
    "# parents=True: also create missing intermediate directories instead of raising.\n",
    "SAVE_DIR.mkdir(parents=True, exist_ok=True)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": true
    },
    "ExecuteTime": {
     "end_time": "2025-07-22T08:38:22.958128Z",
     "start_time": "2025-07-22T08:38:19.270501Z"
    }
   },
   "outputs": [],
   "execution_count": 1
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "Configure the model with the appropriate encoder and decoder checkpoints, and set the mode to \"RNA-protein\". The `HyenaConfig` class is used to define the model configuration parameters such as `d_model`, `emb_dim`, `max_seq_len`, `vocab_len`, and `n_layer`."
}, {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-22T08:38:39.468014Z",
     "start_time": "2025-07-22T08:38:32.800225Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Hyper-parameters that the encoder and decoder configurations have in common.\n",
    "shared_cfg = dict(d_model=128, emb_dim=5, n_layer=1, output_hidden_states=False)\n",
    "\n",
    "# The two configurations differ only in sequence length / vocabulary size.\n",
    "# NOTE(review): enc_cfg and dec_cfg are not passed to `load_from_checkpoint` below;\n",
    "# presumably the checkpoint already carries its own configuration -- confirm.\n",
    "enc_cfg = HyenaConfig(max_seq_len=19202, vocab_len=19202, **shared_cfg)\n",
    "dec_cfg = HyenaConfig(max_seq_len=6427, vocab_len=6427, **shared_cfg)\n",
    "\n",
    "# Restore the fine-tuned model, then point it at the checkpoints and select the mode.\n",
    "model = scTrans.load_from_checkpoint(checkpoint_path=FINETUNE_CKPT)\n",
    "model.encoder_ckpt_path, model.decoder_ckpt_path = ENCODER_CKPT, DECODER_CKPT\n",
    "model.mode = \"RNA-protein\""
   ],
   "outputs": [],
   "execution_count": 3
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "Put your proteins of interest in `../../docs/tutorials/protein_names.txt`, one protein name per line.\n",
    "\n",
    "**Important**: these proteins should be listed in `../../docs/tutorials/protein_index_map.csv` (6427 proteins in total)."
]
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-22T09:14:51.210513Z",
     "start_time": "2025-07-22T09:14:51.114312Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import pandas as pd\n",
    "import scanpy as sc\n",
    "import torch\n",
    "\n",
    "# Only use the first 10 cells for this example.\n",
    "test_adata = sc.read_h5ad(\"../../data/test_sample_rna.h5ad\")[:10]\n",
    "\n",
    "# `.X` may be a SciPy sparse matrix or an already-dense array; densify only when needed.\n",
    "rna_matrix = test_adata.X\n",
    "if hasattr(rna_matrix, \"toarray\"):\n",
    "    rna_matrix = rna_matrix.toarray()\n",
    "# The `.cuda()` calls below require a CUDA-capable GPU.\n",
    "rna_tensor = torch.tensor(rna_matrix, dtype=torch.float32).cuda()\n",
    "\n",
    "# The checkpoint paths and the mode were already set on `model` in the previous cell.\n",
    "model.eval().cuda()\n",
    "\n",
    "with torch.no_grad():\n",
    "    _, _, protein_pred = model(rna_tensor)\n",
    "\n",
    "# Read the requested protein names (one per line); blank lines are ignored and the\n",
    "# file handle is closed as soon as the names are read.\n",
    "with open(\"../../docs/tutorials/protein_names.txt\") as names_file:\n",
    "    requested_proteins = [line.strip() for line in names_file if line.strip()]\n",
    "\n",
    "prot_map = pd.read_csv(\"../../docs/tutorials/protein_index_map.csv\")\n",
    "name_to_idx = dict(zip(prot_map[\"name\"], prot_map[\"index\"]))\n",
    "\n",
    "# Filter the names and the column indices together so that the DataFrame columns\n",
    "# always line up with the selected predictions, even when some names are unknown.\n",
    "target_proteins = [p for p in requested_proteins if p in name_to_idx]\n",
    "missing_proteins = [p for p in requested_proteins if p not in name_to_idx]\n",
    "if missing_proteins:\n",
    "    print(f\"Skipping {len(missing_proteins)} protein(s) missing from protein_index_map.csv: {missing_proteins}\")\n",
    "idx = [name_to_idx[p] for p in target_proteins]\n",
    "\n",
    "pred_df = pd.DataFrame(\n",
    "    protein_pred[:, idx].cpu().numpy(),\n",
    "    columns=target_proteins,\n",
    "    index=test_adata.obs_names,\n",
    ")\n",
    "pred_df.to_csv(SAVE_DIR / \"predicted_protein_expression.csv\")"
   ],
   "outputs": [],
   "execution_count": 5
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}