{
 "cells": [
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "# Imputation Tutorial"
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "This tutorial demonstrates how to use a **fine-tuned** scLinguist model to impute protein expression from RNA data."
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "Import necessary packages and define paths for checkpoints and save directory."
  },
  {
   "cell_type": "code",
   "source": [
    "import importlib\n",
    "import sys\n",
    "from pathlib import Path\n",
    "\n",
    "from torch.utils.data import DataLoader\n",
    "\n",
    "# Add the directory two levels up to the import path so that the\n",
    "# `scLinguist` imports below resolve when running from this folder.\n",
    "sys.path.append('../../')\n",
    "from scLinguist.data_loaders.data_loader import scMultiDataset\n",
    "from scLinguist.model.configuration_hyena import HyenaConfig\n",
    "from scLinguist.model.model import scTrans\n",
    "\n",
    "# Register `scLinguist.model` under the top-level module name `model` too.\n",
    "# NOTE(review): presumably the checkpoints reference a top-level `model`\n",
    "# package when they are unpickled -- confirm before removing this alias.\n",
    "sys.modules['model'] = importlib.import_module('scLinguist.model')\n",
    "\n",
    "# Checkpoint locations and the directory that receives the predictions.\n",
    "ENCODER_CKPT = Path(\"../../pretrained_model/encoder.ckpt\")\n",
    "DECODER_CKPT = Path(\"../../pretrained_model/decoder.ckpt\")\n",
    "FINETUNE_CKPT = Path(\"../../pretrained_model/finetune.ckpt\")\n",
    "SAVE_DIR = Path(\"../../docs/tutorials/finetune_output\")\n",
    "# parents=True: also create missing intermediate directories instead of\n",
    "# raising FileNotFoundError on a fresh checkout.\n",
    "SAVE_DIR.mkdir(parents=True, exist_ok=True)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": true
    },
    "ExecuteTime": {
     "end_time": "2025-07-22T08:38:22.958128Z",
     "start_time": "2025-07-22T08:38:19.270501Z"
    }
   },
   "outputs": [],
   "execution_count": 1
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": "Load the fine-tuned model from its checkpoint, point it at the pretrained encoder and decoder checkpoints, and set the mode to \"RNA-protein\". The `HyenaConfig` class defines model configuration parameters such as `d_model`, `emb_dim`, `max_seq_len`, `vocab_len`, and `n_layer`."
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-22T08:38:39.468014Z",
     "start_time": "2025-07-22T08:38:32.800225Z"
    }
   },
   "cell_type": "code",
   "source": [
    "# Keyword arguments shared by the encoder and decoder configurations.\n",
    "shared_hparams = dict(\n",
    "    d_model=128,\n",
    "    emb_dim=5,\n",
    "    n_layer=1,\n",
    "    output_hidden_states=False,\n",
    ")\n",
    "# The two configurations differ only in sequence length / vocabulary size.\n",
    "# NOTE(review): neither config is passed to anything in this notebook;\n",
    "# confirm whether they are still needed.\n",
    "enc_cfg = HyenaConfig(max_seq_len=19202, vocab_len=19202, **shared_hparams)\n",
    "dec_cfg = HyenaConfig(max_seq_len=6427, vocab_len=6427, **shared_hparams)\n",
    "\n",
    "# Load the fine-tuned checkpoint, then attach the encoder/decoder checkpoint\n",
    "# paths and set the mode on the loaded model.\n",
    "model = scTrans.load_from_checkpoint(checkpoint_path=FINETUNE_CKPT)\n",
    "model.encoder_ckpt_path = ENCODER_CKPT\n",
    "model.decoder_ckpt_path = DECODER_CKPT\n",
    "model.mode = \"RNA-protein\""
   ],
   "outputs": [],
   "execution_count": 3
  },
  {
   "metadata": {},
   "cell_type": "markdown",
   "source": [
    "Put the names of your proteins of interest, one per line, in `../../docs/tutorials/protein_names.txt`.\n",
    "\n",
    "**Important**: every name must also appear in the `name` column of `../../docs/tutorials/protein_index_map.csv` (6427 proteins in total)."
   ]
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-22T09:14:51.210513Z",
     "start_time": "2025-07-22T09:14:51.114312Z"
    }
   },
   "cell_type": "code",
   "source": [
    "import warnings\n",
    "\n",
    "import pandas as pd\n",
    "import scanpy as sc\n",
    "import torch\n",
    "\n",
    "# Use the GPU when one is available; otherwise fall back to the CPU so the\n",
    "# example does not fail outright on machines without CUDA.\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "\n",
    "# only use 10 cells for example\n",
    "test_adata = sc.read_h5ad(\"../../data/test_sample_rna.h5ad\")[:10]\n",
    "# `.X` may be a sparse matrix or an already-dense array; densify only if needed.\n",
    "rna_matrix = test_adata.X.toarray() if hasattr(test_adata.X, \"toarray\") else test_adata.X\n",
    "rna_tensor = torch.tensor(rna_matrix, dtype=torch.float32, device=device)\n",
    "\n",
    "model.encoder_ckpt_path = ENCODER_CKPT\n",
    "model.decoder_ckpt_path = DECODER_CKPT\n",
    "model.mode = \"RNA-protein\"\n",
    "model.eval().to(device)\n",
    "\n",
    "# Inference only: no gradients are needed. The third returned value is used\n",
    "# as the protein prediction; the first two are ignored here.\n",
    "with torch.no_grad():\n",
    "    _, _, protein_pred = model(rna_tensor)\n",
    "\n",
    "# impute given proteins\n",
    "# Read one protein name per line, skipping blank lines, and close the file.\n",
    "with open(\"../../docs/tutorials/protein_names.txt\") as names_file:\n",
    "    target_proteins = [line.strip() for line in names_file if line.strip()]\n",
    "\n",
    "prot_map = pd.read_csv(\"../../docs/tutorials/protein_index_map.csv\")\n",
    "name_to_idx = dict(zip(prot_map[\"name\"], prot_map[\"index\"]))\n",
    "\n",
    "# Filter names and column indices together so the DataFrame's column labels\n",
    "# always match the selected prediction columns.\n",
    "found_proteins = [p for p in target_proteins if p in name_to_idx]\n",
    "missing_proteins = [p for p in target_proteins if p not in name_to_idx]\n",
    "if missing_proteins:\n",
    "    warnings.warn(\n",
    "        f\"Skipping {len(missing_proteins)} protein(s) not found in protein_index_map.csv: {missing_proteins}\"\n",
    "    )\n",
    "idx = [name_to_idx[p] for p in found_proteins]\n",
    "\n",
    "pred_df = pd.DataFrame(\n",
    "    protein_pred[:, idx].cpu().numpy(),\n",
    "    columns=found_proteins,\n",
    "    index=test_adata.obs_names,\n",
    ")\n",
    "pred_df.to_csv(SAVE_DIR / \"predicted_protein_expression.csv\")"
   ],
   "outputs": [],
   "execution_count": 5
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}