Benchmarking Procedures for scRegulate¶

Part 1 (Data Preparation - SEED=42)¶


1-I. GRouNdGAN Synthetic Data¶

All datasets in this section can be found on the GRouNdGAN website (last accessed 2/6/2025): https://emad-combine-lab.github.io/GRouNdGAN/benchmarking.html


1-I-A. PBMC - Human¶

PBMC-ALL (Zheng et al., 2017). Reference dataset: human peripheral blood mononuclear cell (PBMC Donor A) dataset from:

https://emad-combine-lab.github.io/GRouNdGAN/benchmarking.html

https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/fresh_68k_pbmc_donor_a

Steps:¶

This workflow includes the following:

  1. QC filtering
  2. Normalization
  3. Sketch-based data splitting (5-fold)
In [1]:
import pandas as pd
import scanpy as sc
import geosketch
import numpy as np
In [2]:
read_path = './datasets/groundGan/PBMC-ALL_100K_cells.h5ad'
rna_data_all = sc.read_h5ad(read_path)
rna_data_all
Out[2]:
AnnData object with n_obs × n_vars = 100000 × 1000
In [3]:
print('Raw Data:')
print(f'rna_data_all.shape: {rna_data_all.shape}')
rna_data_all.var_names_make_unique()
rna_data_all.obs_names_make_unique()
# mitochondrial genes, "MT-" for human, "Mt-" for mouse
rna_data_all.var["mt"] = rna_data_all.var_names.str.startswith("MT-")
# ribosomal genes
rna_data_all.var["ribo"] = rna_data_all.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
rna_data_all.var["hb"] = rna_data_all.var_names.str.contains("^HB[^(P)]")
sc.pp.calculate_qc_metrics(rna_data_all, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True)
sc.pp.filter_cells(rna_data_all, min_genes=10)
sc.pp.filter_genes(rna_data_all, min_cells=3)
sc.pp.normalize_total(rna_data_all)
sc.pp.log1p(rna_data_all)
print('Final Data:')
print(f'rna_data_all.shape: {rna_data_all.shape}')
Raw Data:
rna_data_all.shape: (100000, 1000)
Final Data:
rna_data_all.shape: (99998, 986)
In [4]:
import geosketch
import numpy as np
import scanpy as sc

# Full dataset (rna_data_all) is assumed to be loaded and preprocessed above
num_subsets = 5
desired_subset_size = rna_data_all.shape[0] // num_subsets  # 20K per subset
random_seed = 42

# Step 1: Cluster the Data
sc.pp.neighbors(rna_data_all, use_rep="X")
sc.tl.leiden(rna_data_all, resolution=1.0, flavor="igraph", n_iterations=2)
rna_data_all.obs["pseudo_cluster"] = rna_data_all.obs["leiden"]

pseudo_clusters = rna_data_all.obs["pseudo_cluster"].unique()
split_datasets = [[] for _ in range(num_subsets)]

# Step 2: Proportional Stratified Sampling within Each Pseudo-Cluster
total_cells = rna_data_all.n_obs

for cluster in pseudo_clusters:
    # Get indices of cells in the current cluster
    cluster_indices = np.where(rna_data_all.obs["pseudo_cluster"] == cluster)[0]
    n_cluster = len(cluster_indices)
    
    # Per-subset target for this cluster, proportional to cluster size
    # (int() floors, so each subset may come out slightly below the desired size)
    target_cells_cluster = int((n_cluster / total_cells) * desired_subset_size)

    # Never request more cells than the cluster contains
    target_cells_cluster = min(target_cells_cluster, n_cluster)

    # Skip clusters too small to contribute at least one cell per subset
    if target_cells_cluster == 0:
        continue

    for i in range(num_subsets):
        # Apply geosketch with a seed offset for reproducibility
        sketch_indices = geosketch.gs(rna_data_all[cluster_indices].X,
                                      target_cells_cluster,
                                      replace=False,
                                      seed=random_seed + i)
        # Map back to global indices
        selected_indices = cluster_indices[sketch_indices]
        split_datasets[i].extend(selected_indices)

# Step 3: Convert selected indices into separate AnnData objects
for i in range(num_subsets):
    subset = rna_data_all[split_datasets[i], :].copy()
    subset.write_h5ad(f"./datasets/groundGan/PBMC-ALL_100K_cells_subset_{i+1}.h5ad")
    
print("Proportional stratified sketching applied. Datasets successfully saved.")
Proportional stratified sketching applied. Datasets successfully saved.
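Because geosketch is run independently for each subset (with different seeds) over the same per-cluster cell pool, the five sketches are not guaranteed to be mutually disjoint. A minimal sketch to quantify pairwise overlap between the saved subsets, assuming the file names written above:

import itertools
import scanpy as sc

paths = [f"./datasets/groundGan/PBMC-ALL_100K_cells_subset_{i+1}.h5ad" for i in range(5)]
cell_sets = [set(sc.read_h5ad(p).obs_names) for p in paths]

# Report the number of shared cells for every pair of subsets
for i, j in itertools.combinations(range(5), 2):
    print(f"Subset {i+1} vs Subset {j+1}: {len(cell_sets[i] & cell_sets[j])} shared cells")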
In [5]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt

# Define file paths
data_dir = "./datasets/groundGan/"
dataset_paths = [f"{data_dir}PBMC-ALL_100K_cells_subset_{i+1}.h5ad" for i in range(5)]

# Load datasets and compute mean expression
mean_expressions = []
for path in dataset_paths:
    adata = sc.read_h5ad(path)
    mean_expressions.append(np.array(adata.X.mean(axis=0)).flatten())

# Convert to DataFrame
mean_expr_df = pd.DataFrame(np.vstack(mean_expressions).T)

# Compute Pearson correlation between datasets
correlation_matrix = mean_expr_df.corr(method="pearson")

# Plot heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f",
            xticklabels=[f"Subset {i+1}" for i in range(5)],
            yticklabels=[f"Subset {i+1}" for i in range(5)])
plt.title("Correlation Between Dataset Subsets")
plt.show()
[Figure: Pearson correlation heatmap between the five PBMC-ALL subsets]
In [6]:
import scanpy as sc
import matplotlib.pyplot as plt

# Define file paths
data_dir = "./datasets/groundGan/"
dataset_paths = [f"{data_dir}PBMC-ALL_100K_cells_subset_{i+1}.h5ad" for i in range(5)]

# Set up the figure for multiple UMAPs
fig, axes = plt.subplots(1, 5, figsize=(20, 4))

# Process each dataset
for i, path in enumerate(dataset_paths):
    adata = sc.read_h5ad(path)

    # Compute UMAP if not already computed
    if 'X_umap' not in adata.obsm:
        sc.pp.neighbors(adata, use_rep="X")  # Compute neighbors using gene expression
        sc.tl.umap(adata)  # Compute UMAP

    # Plot UMAP
    sc.pl.umap(adata, ax=axes[i], show=False, title=f"Subset {i+1}")

plt.tight_layout()
plt.show()
[Figure: UMAP embeddings of the five PBMC-ALL subsets]

1-I-B. Tumor-ALL (Han et al., 2022) - Human¶

Reference dataset: malignant cells as well as cells in the tumor microenvironment (called the Tumor-ALL dataset here) from 20 fresh core needle biopsies of follicular lymphoma patients.

https://emad-combine-lab.github.io/GRouNdGAN/benchmarking.html

https://cellxgene.cziscience.com/collections/968834a0-1895-40df-8720-666029b3bbac

In [2]:
rna_data_all = sc.read_h5ad('./datasets/groundGan/Tumor-ALL_100K_cells.h5ad')
rna_data_all
Out[2]:
AnnData object with n_obs × n_vars = 100000 × 1000
In [3]:
print('Raw Data:')
print(f'rna_data_all.shape: {rna_data_all.shape}')
rna_data_all.var_names_make_unique()
rna_data_all.obs_names_make_unique()
# mitochondrial genes, "MT-" for human, "Mt-" for mouse
rna_data_all.var["mt"] = rna_data_all.var_names.str.startswith("MT-")
# ribosomal genes
rna_data_all.var["ribo"] = rna_data_all.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
rna_data_all.var["hb"] = rna_data_all.var_names.str.contains("^HB[^(P)]")
sc.pp.calculate_qc_metrics(rna_data_all, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True)
sc.pp.filter_cells(rna_data_all, min_genes=10)
sc.pp.filter_genes(rna_data_all, min_cells=3)
sc.pp.normalize_total(rna_data_all)
sc.pp.log1p(rna_data_all)
print('Final Data:')
print(f'rna_data_all.shape: {rna_data_all.shape}')
Raw Data:
rna_data_all.shape: (100000, 1000)
Final Data:
rna_data_all.shape: (100000, 836)
In [4]:
import geosketch
import numpy as np
import scanpy as sc

# Full dataset (rna_data_all) is assumed to be loaded and preprocessed above
num_subsets = 5
desired_subset_size = rna_data_all.shape[0] // num_subsets  # 20K per subset
random_seed = 42

# Step 1: Cluster the Data
sc.pp.neighbors(rna_data_all, use_rep="X")
sc.tl.leiden(rna_data_all, resolution=1.0, flavor="igraph", n_iterations=2)
rna_data_all.obs["pseudo_cluster"] = rna_data_all.obs["leiden"]

pseudo_clusters = rna_data_all.obs["pseudo_cluster"].unique()
split_datasets = [[] for _ in range(num_subsets)]

# Step 2: Proportional Stratified Sampling within Each Pseudo-Cluster
total_cells = rna_data_all.n_obs

for cluster in pseudo_clusters:
    # Get indices of cells in the current cluster
    cluster_indices = np.where(rna_data_all.obs["pseudo_cluster"] == cluster)[0]
    n_cluster = len(cluster_indices)
    
    # Per-subset target for this cluster, proportional to cluster size
    # (int() floors, so each subset may come out slightly below the desired size)
    target_cells_cluster = int((n_cluster / total_cells) * desired_subset_size)

    # Never request more cells than the cluster contains
    target_cells_cluster = min(target_cells_cluster, n_cluster)

    # Skip clusters too small to contribute at least one cell per subset
    if target_cells_cluster == 0:
        continue

    for i in range(num_subsets):
        # Apply geosketch with a seed offset for reproducibility
        sketch_indices = geosketch.gs(rna_data_all[cluster_indices].X,
                                      target_cells_cluster,
                                      replace=False,
                                      seed=random_seed + i)
        # Map back to global indices
        selected_indices = cluster_indices[sketch_indices]
        split_datasets[i].extend(selected_indices)

# Step 3: Convert selected indices into separate AnnData objects
for i in range(num_subsets):
    subset = rna_data_all[split_datasets[i], :].copy()
    subset.write_h5ad(f"./datasets/groundGan/Tumor-ALL_100K_cells_subset_{i+1}.h5ad")

print("Stratified sketching using pseudo-clusters applied. Datasets successfully saved.")
Stratified sketching using pseudo-clusters applied. Datasets successfully saved.
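A quick size check on the saved subsets (assuming the file names written above); because per-cluster targets are floored with int(), each subset can land slightly below the 20K target:

import scanpy as sc

for i in range(5):
    adata = sc.read_h5ad(f"./datasets/groundGan/Tumor-ALL_100K_cells_subset_{i+1}.h5ad")
    print(f"Subset {i+1}: {adata.n_obs} cells x {adata.n_vars} genes")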
In [5]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt

# Define file paths
data_dir = "./datasets/groundGan/"
dataset_paths = [f"{data_dir}Tumor-ALL_100K_cells_subset_{i+1}.h5ad" for i in range(5)]

# Load datasets and compute mean expression
mean_expressions = []
for path in dataset_paths:
    adata = sc.read_h5ad(path)
    mean_expressions.append(np.array(adata.X.mean(axis=0)).flatten())

# Convert to DataFrame
mean_expr_df = pd.DataFrame(np.vstack(mean_expressions).T)

# Compute Pearson correlation between datasets
correlation_matrix = mean_expr_df.corr(method="pearson")

# Plot heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f",
            xticklabels=[f"Subset {i+1}" for i in range(5)],
            yticklabels=[f"Subset {i+1}" for i in range(5)])
plt.title("Correlation Between Dataset Subsets")
plt.show()
[Figure: Pearson correlation heatmap between the five Tumor-ALL subsets]
In [2]:
import scanpy as sc
import matplotlib.pyplot as plt

# Define file paths
data_dir = "./datasets/groundGan/"
dataset_paths = [f"{data_dir}Tumor-ALL_100K_cells_subset_{i+1}.h5ad" for i in range(5)]

# Set up the figure for multiple UMAPs
fig, axes = plt.subplots(1, 5, figsize=(20, 4))

# Process each dataset
for i, path in enumerate(dataset_paths):
    adata = sc.read_h5ad(path)

    # Compute UMAP if not already computed
    if 'X_umap' not in adata.obsm:
        sc.pp.neighbors(adata, use_rep="X")  # Compute neighbors using gene expression
        sc.tl.umap(adata)  # Compute UMAP

    # Plot UMAP
    sc.pl.umap(adata, ax=axes[i], show=False, title=f"Subset {i+1}")

plt.tight_layout()
plt.show()
[Figure: UMAP embeddings of the five Tumor-ALL subsets]

1-I-C. Dahlin (Dahlin et al., 2018) - Mouse¶

Reference dataset: Hematopoietic dataset corresponding to the scRNA-seq (10x Genomics) profiles of mouse bone marrow hematopoietic stem and progenitor cells (HSPCs) differentiating towards different lineages from GEO (accession number: GSE107727).

https://nextcloud.computecanada.ca/index.php/s/WXLKasKsjrGaEAR/download

https://emad-combine-lab.github.io/GRouNdGAN/benchmarking.html
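The file can be fetched directly from the Nextcloud link above before running the cells below; a minimal sketch using urllib, assuming that link serves the Dahlin .h5ad, saved under the local name read in the next cell:

import os
import urllib.request

url = "https://nextcloud.computecanada.ca/index.php/s/WXLKasKsjrGaEAR/download"
out_path = "./datasets/groundGan/Dahlin_100K_cells.h5ad"  # assumed local name, matching the read below
os.makedirs(os.path.dirname(out_path), exist_ok=True)
urllib.request.urlretrieve(url, out_path)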

In [10]:
rna_data_all = sc.read_h5ad('./datasets/groundGan/Dahlin_100K_cells.h5ad')
rna_data_all
Out[10]:
AnnData object with n_obs × n_vars = 100000 × 1000
In [11]:
print('Raw Data:')
print(f'rna_data_all.shape: {rna_data_all.shape}')
rna_data_all.var_names_make_unique()
rna_data_all.obs_names_make_unique()
# mitochondrial genes, "MT-" for human, "Mt-" for mouse
rna_data_all.var["mt"] = rna_data_all.var_names.str.startswith("Mt-")
# ribosomal genes
rna_data_all.var["ribo"] = rna_data_all.var_names.str.startswith(("Rps", "Rpl"))
# hemoglobin genes
rna_data_all.var["hb"] = rna_data_all.var_names.str.contains("^Hb[^p]")
sc.pp.calculate_qc_metrics(rna_data_all, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True)
sc.pp.filter_cells(rna_data_all, min_genes=10)
sc.pp.filter_genes(rna_data_all, min_cells=3)
sc.pp.normalize_total(rna_data_all)
sc.pp.log1p(rna_data_all)
print('Final Data:')
print(f'rna_data_all.shape: {rna_data_all.shape}')
Raw Data:
rna_data_all.shape: (100000, 1000)
Final Data:
rna_data_all.shape: (100000, 971)
In [12]:
import geosketch
import numpy as np
import scanpy as sc

# Full dataset (rna_data_all) is assumed to be loaded and preprocessed above
num_subsets = 5
desired_subset_size = rna_data_all.shape[0] // num_subsets  # 20K per subset
random_seed = 42

# Step 1: Cluster the Data
sc.pp.neighbors(rna_data_all, use_rep="X")
sc.tl.leiden(rna_data_all, resolution=1.0, flavor="igraph", n_iterations=2)
rna_data_all.obs["pseudo_cluster"] = rna_data_all.obs["leiden"]

pseudo_clusters = rna_data_all.obs["pseudo_cluster"].unique()
split_datasets = [[] for _ in range(num_subsets)]

# Step 2: Proportional Stratified Sampling within Each Pseudo-Cluster
total_cells = rna_data_all.n_obs

for cluster in pseudo_clusters:
    # Get indices of cells in the current cluster
    cluster_indices = np.where(rna_data_all.obs["pseudo_cluster"] == cluster)[0]
    n_cluster = len(cluster_indices)
    
    # Per-subset target for this cluster, proportional to cluster size
    # (int() floors, so each subset may come out slightly below the desired size)
    target_cells_cluster = int((n_cluster / total_cells) * desired_subset_size)

    # Never request more cells than the cluster contains
    target_cells_cluster = min(target_cells_cluster, n_cluster)

    # Skip clusters too small to contribute at least one cell per subset
    if target_cells_cluster == 0:
        continue

    for i in range(num_subsets):
        # Apply geosketch with a seed offset for reproducibility
        sketch_indices = geosketch.gs(rna_data_all[cluster_indices].X,
                                      target_cells_cluster,
                                      replace=False,
                                      seed=random_seed + i)
        # Map back to global indices
        selected_indices = cluster_indices[sketch_indices]
        split_datasets[i].extend(selected_indices)

# Step 3: Convert selected indices into separate AnnData objects
for i in range(num_subsets):
    subset = rna_data_all[split_datasets[i], :].copy()
    # Save subset
    subset.write_h5ad(f"./datasets/groundGan/Dahlin_cells_subset_{i+1}.h5ad")

print("Datasets successfully split into disjoint, representative subsets using geosketch.")
Datasets successfully split into disjoint, representative subsets using geosketch.
In [13]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt

# Define file paths
data_dir = "./datasets/groundGan/"
dataset_paths = [f"{data_dir}Dahlin_cells_subset_{i+1}.h5ad" for i in range(5)]

# Load datasets and compute mean expression
mean_expressions = []
for path in dataset_paths:
    adata = sc.read_h5ad(path)
    mean_expressions.append(np.array(adata.X.mean(axis=0)).flatten())

# Convert to DataFrame
mean_expr_df = pd.DataFrame(np.vstack(mean_expressions).T)

# Compute Pearson correlation between datasets
correlation_matrix = mean_expr_df.corr(method="pearson")

# Plot heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f",
            xticklabels=[f"Subset {i+1}" for i in range(5)],
            yticklabels=[f"Subset {i+1}" for i in range(5)])
plt.title("Correlation Between Dataset Subsets")
plt.show()
[Figure: Pearson correlation heatmap between the five Dahlin subsets]
In [3]:
import scanpy as sc
import matplotlib.pyplot as plt

# Define file paths
data_dir = "./datasets/groundGan/"
dataset_paths = [f"{data_dir}Dahlin_cells_subset_{i+1}.h5ad" for i in range(5)]

# Set up the figure for multiple UMAPs
fig, axes = plt.subplots(1, 5, figsize=(20, 4))

# Process each dataset
for i, path in enumerate(dataset_paths):
    adata = sc.read_h5ad(path)

    # Compute UMAP if not already computed
    if 'X_umap' not in adata.obsm:
        sc.pp.neighbors(adata, use_rep="X")  # Compute neighbors using gene expression
        sc.tl.umap(adata)  # Compute UMAP

    # Plot UMAP
    sc.pl.umap(adata, ax=axes[i], show=False, title=f"Subset {i+1}")

plt.tight_layout()
plt.show()
[Figure: UMAP embeddings of the five Dahlin subsets]

1-II. Perturb-seq Data¶

1-II-A. Dixit et al., 2016¶

In [5]:
rna_data_all = sc.read('./datasets/Dixit_2016.h5ad')
sc.pp.filter_cells(rna_data_all, min_genes=10)
sc.pp.filter_genes(rna_data_all, min_cells=3)
# fraction=1 keeps every cell; this call only shuffles cell order reproducibly
sc.pp.subsample(rna_data_all, fraction=1, random_state=42)
rna_data_all
Out[5]:
AnnData object with n_obs × n_vars = 99722 × 18531
    obs: 'cluster', 'guide', 'INTERGENIC1144056', 'INTERGENIC1216445', 'INTERGENIC216151', 'INTERGENIC393453', 'sgCREB1_2', 'sgCREB1_4', 'sgCREB1_5', 'sgE2F4_6', 'sgE2F4_7', 'sgEGR1_2', 'sgEGR1_3', 'sgEGR1_4', 'sgELF1_1', 'sgELF1_2', 'sgELF1_4', 'sgELF1_5', 'sgELK1_1', 'sgELK1_6', 'sgELK1_7', 'sgETS1_3', 'sgETS1_5', 'sgGABPA_1', 'sgGABPA_9', 'sgIRF1_2', 'sgIRF1_3', 'sgNR2C2_2', 'sgNR2C2_3', 'sgNR2C2_5', 'sgYY1_10', 'sgYY1_3', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'perturbation_name', 'perturbation_type', 'perturbation_value', 'perturbation_unit'
    var: 'index', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'doi', 'hvg', 'leiden', 'neighbors', 'pca', 'preprocessing_nb_link', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'
In [9]:
from IPython.display import display

# Perturbation labels per cell; this Series is iterated below
perturbations = rna_data_all.obs['perturbation_name']

# Initialize list to hold info
perturbation_info = []

# Loop through perturbations
for pert in perturbations.unique():
    if pert == 'control':
        pert_type = 'control'
        tf_count = 0
        same_tf = 'N/A'
    else:
        tfs = pert.split('+')
        tf_count = len(tfs)
        same_tf = 'same' if len(set(tfs)) == 1 else 'mixed'
        
        if tf_count == 1:
            pert_type = 'single'
        elif tf_count == 2:
            pert_type = 'double'
        elif tf_count == 3:
            pert_type = 'triple'
        else:
            pert_type = f'{tf_count}-ple'

    # Count number of cells for each perturbation
    cell_count = (perturbations == pert).sum()

    # Append results
    perturbation_info.append({
        'Perturbation': pert,
        'Type': pert_type,
        'TF_Count': tf_count,
        'Same_or_Mixed': same_tf,
        'Cell_Count': cell_count
    })

# Create DataFrame
perturb_df = pd.DataFrame(perturbation_info)

# Sort for better readability
perturb_df = perturb_df.sort_values(by=['Type', 'Same_or_Mixed', 'Cell_Count'], ascending=[True, True, False])

# Display the DataFrame in Jupyter Notebook
display(perturb_df)


# Filter for 'same' TF knockouts only
same_tf_df = perturb_df[perturb_df['Same_or_Mixed'] == 'same']
display(same_tf_df)
Perturbation Type TF_Count Same_or_Mixed Cell_Count
386 INTERGENIC1216445+E2F4+ELK1+ETS1 4-ple 4 mixed 2
512 EGR1+ELF1+ELK1+NR2C2 4-ple 4 mixed 2
524 CREB1+ELF1+ETS1+YY1 4-ple 4 mixed 2
140 E2F4+EGR1+NR2C2+YY1 4-ple 4 mixed 1
231 INTERGENIC393453+E2F4+ELK1+NR2C2 4-ple 4 mixed 1
... ... ... ... ... ...
639 ELF1+NR2C2+YY10 triple 3 mixed 1
640 INTERGENIC1144056+ELK1+YY10 triple 3 mixed 1
470 EGR1+EGR1+EGR1 triple 3 same 1
492 ELF1+ELF1+ELF1 triple 3 same 1
581 CREB1+CREB1+CREB1 triple 3 same 1

641 rows × 5 columns

Perturbation Type TF_Count Same_or_Mixed Cell_Count
39 ELF1+ELF1 double 2 same 219
108 ELK1+ELK1 double 2 same 82
139 CREB1+CREB1 double 2 same 79
235 EGR1+EGR1 double 2 same 53
109 NR2C2+NR2C2 double 2 same 45
151 GABPA+GABPA double 2 same 30
43 ETS1+ETS1 double 2 same 28
144 E2F4+E2F4 double 2 same 18
344 IRF1+IRF1 double 2 same 7
0 ELF1 single 1 same 11676
17 CREB1 single 1 same 7587
8 ELK1 single 1 same 7366
16 INTERGENIC393453 single 1 same 7177
14 EGR1 single 1 same 6181
15 GABPA single 1 same 5462
7 NR2C2 single 1 same 5030
4 ETS1 single 1 same 4815
10 E2F4 single 1 same 4516
9 IRF1 single 1 same 3837
18 INTERGENIC1144056 single 1 same 3518
22 INTERGENIC216151 single 1 same 3518
12 INTERGENIC1216445 single 1 same 2758
26 YY1 single 1 same 2499
3 YY10 single 1 same 2092
470 EGR1+EGR1+EGR1 triple 3 same 1
492 ELF1+ELF1+ELF1 triple 3 same 1
581 CREB1+CREB1+CREB1 triple 3 same 1
In [10]:
# Filter for double-same knockouts and control
double_same_or_control = perturb_df[
    ((perturb_df['Type'] == 'double') & (perturb_df['Same_or_Mixed'] == 'same')) | 
    (perturb_df['Type'] == 'control')
]['Perturbation']

# Subset the AnnData object
double_same_rna_data = rna_data_all[rna_data_all.obs['perturbation_name'].isin(double_same_or_control)].copy()

# Check the unique perturbations in the subset
print(double_same_rna_data.obs['perturbation_name'].unique())
['control', 'ELF1+ELF1', 'ETS1+ETS1', 'ELK1+ELK1', 'NR2C2+NR2C2', 'CREB1+CREB1', 'E2F4+E2F4', 'GABPA+GABPA', 'EGR1+EGR1', 'IRF1+IRF1']
Categories (10, object): ['CREB1+CREB1', 'E2F4+E2F4', 'EGR1+EGR1', 'ELF1+ELF1', ..., 'GABPA+GABPA', 'IRF1+IRF1', 'NR2C2+NR2C2', 'control']
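Before saving, it is worth confirming the per-condition cell counts in the filtered object; a minimal sketch:

# Cells per retained condition (double-same knockouts plus control)
print(double_same_rna_data.obs['perturbation_name'].value_counts())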
In [11]:
double_same_rna_data.write_h5ad('Dixit_2016_clean.h5ad')

1-III. Benchmark Data for Clustering¶

1-III-A. Preprocessing for benchmarking¶

Data used in this section:¶

  1. Human PBMC 3K scRNA
  2. Mouse Tabula Muris Lung scRNA
  3. Mouse Tabula Muris Heart scRNA
  4. Mouse Tabula Muris Brain scRNA
In [36]:
# Define directories
data_dir = "./"
output_dir = "./Cluster_Evaluation_Data/"

# Define benchmark datasets
benchmark_datasets = {
    "pbmc": {"source": "builtin"}, 
    "lung": {"path": f"{data_dir}lung.h5ad"}, 
    "heart": {"path": f"{data_dir}heart.h5ad"},
    "brain": {"path": f"{data_dir}brain.h5ad"},
}
In [38]:
import os
def preprocess_rna_data(rna_data):
    # Basic QC filtering (no shuffling is performed here)
    sc.pp.filter_cells(rna_data, min_genes=10)
    sc.pp.filter_genes(rna_data, min_cells=3)
    
    # Assign ground truth labels
    if "cell_ontology_class" in rna_data.obs:
        rna_data.obs["truth"] = rna_data.obs["cell_ontology_class"].copy()
        sc.pp.normalize_total(rna_data, target_sum=1e6)
        sc.pp.log1p(rna_data)
    elif "louvain" in rna_data.obs:
        rna_data.obs["truth"] = rna_data.obs["louvain"]
    return rna_data

# Training loop
benchmark_results = {}

for dataset_name, dataset_info in benchmark_datasets.items():
    print(f"\n🔹 Preprocessing {dataset_name}...")

    # Load RNA data
    if dataset_info.get("source") == "builtin":
        rna_data = sc.datasets.pbmc3k_processed()
        raw_X = rna_data.raw.X.copy()
        obs = rna_data.obs.copy()
        var = rna_data.raw.var.copy()
        rna_data = sc.AnnData(X=raw_X, obs=obs, var=var)
        rna_data = preprocess_rna_data(rna_data)
    else:
        rna_data = sc.read_h5ad(dataset_info["path"])
        rna_data.X = rna_data.raw.X.copy()
        # Workaround: writing h5ad fails if a raw var column is named '_index'
        rna_data._raw._var.rename(columns={'_index': 'features'}, inplace=True)
        rna_data = preprocess_rna_data(rna_data)

    # Save processed dataset using `os.path.join()`
    save_path = os.path.join(output_dir, f"{dataset_name}.h5ad")
    rna_data.write_h5ad(save_path)
    
    # Print success message
    print(f"✅ Saved {dataset_name} to {save_path}")
🔹 Preprocessing pbmc...
✅ Saved pbmc to ./Cluster_Evaluation_Data/pbmc.h5ad

🔹 Preprocessing lung...
✅ Saved lung to ./Cluster_Evaluation_Data/lung.h5ad

🔹 Preprocessing heart...
✅ Saved heart to ./Cluster_Evaluation_Data/heart.h5ad

🔹 Preprocessing brain...
✅ Saved brain to ./Cluster_Evaluation_Data/brain.h5ad
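A short sanity check that each saved benchmark file loads and carries the ground-truth labels assigned in preprocess_rna_data; a sketch, assuming the paths printed above:

import os
import scanpy as sc

output_dir = "./Cluster_Evaluation_Data/"
for name in ["pbmc", "lung", "heart", "brain"]:
    adata = sc.read_h5ad(os.path.join(output_dir, f"{name}.h5ad"))
    print(f"{name}: {adata.shape}, {adata.obs['truth'].nunique()} ground-truth cell types")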

1-III-B. Simulating 10% and 30% Dropout Effect¶

Data used in this section:¶

  1. Human PBMC 3K scRNA
  2. Mouse Tabula Muris Lung scRNA
  3. Mouse Tabula Muris Heart scRNA
  4. Mouse Tabula Muris Brain scRNA
In [4]:
import numpy as np
import scipy.sparse as sp
import scanpy as sc
import os
import time

# =============================================================================
# Noise Injection Functions
# =============================================================================

def add_gaussian_noise(adata, noise_level=0.05, seed=42):
    """
    Injects Gaussian noise into a sparse AnnData object.

    Parameters:
    - adata: AnnData object (scRNA-seq data)
    - noise_level: Percentage of noise to add (e.g., 0.05 for 5% noise)
    - seed: Fixed random seed for reproducibility

    Returns:
    - Noisy AnnData object (new copy) with sparse format preserved
    """
    np.random.seed(seed)
    noisy_adata = adata.copy()

    # Convert sparse to dense for processing
    data_dense = noisy_adata.X.toarray() if sp.issparse(noisy_adata.X) else noisy_adata.X

    # Compute standard deviation of each gene (column-wise)
    std_devs = np.std(data_dense, axis=0)

    # Generate Gaussian noise (scaled by gene-specific std)
    noise = np.random.normal(loc=0, scale=noise_level * std_devs, size=data_dense.shape)

    # Add noise and clip negative values to zero
    data_noisy = np.clip(data_dense + noise, 0, None)

    # Convert back to sparse matrix to save memory
    noisy_adata.X = sp.csr_matrix(data_noisy)

    return noisy_adata

def add_dropout_noise(adata, dropout_rate=0.2, seed=42):
    """
    Simulates dropout noise by randomly setting a fraction of values to zero.

    Parameters:
    - adata: AnnData object
    - dropout_rate: Fraction of values to drop (0.1 = 10% dropout)
    - seed: Random seed for reproducibility

    Returns:
    - AnnData object with dropout noise
    """
    np.random.seed(seed)
    noisy_adata = adata.copy()

    # Convert sparse matrix to dense for processing
    data_dense = noisy_adata.X.toarray() if sp.issparse(noisy_adata.X) else noisy_adata.X

    # Generate a dropout mask (1 = keep, 0 = drop)
    dropout_mask = np.random.rand(*data_dense.shape) > dropout_rate

    # Apply dropout noise
    data_noisy = data_dense * dropout_mask

    # Convert back to sparse matrix
    noisy_adata.X = sp.csr_matrix(data_noisy)

    return noisy_adata

# =============================================================================
# Dataset Processing
# =============================================================================

# Define directories
data_dir = "./Cluster_Evaluation_Data/"
output_dir = "./Cluster_Evaluation_Data/"

# Define benchmark datasets
benchmark_datasets = {
    "pbmc": {"path": f"{data_dir}pbmc.h5ad"},
    "lung": {"path": f"{data_dir}lung.h5ad"},
    "heart": {"path": f"{data_dir}heart.h5ad"},
    "brain": {"path": f"{data_dir}brain.h5ad"},
}

# Dropout levels to test
dropout_levels = [0.1, 0.3]

for dataset_name, dataset_info in benchmark_datasets.items():
    print(f"\n📌 Processing dataset: {dataset_name}")

    # Load dataset
    original_data = sc.read_h5ad(dataset_info["path"])

    for dropout_level in dropout_levels:
        print(f"  🔹 Applying {int(dropout_level * 100)}% dropout...")

        start_time = time.time()

        # Apply dropout noise
        noisy_data = add_dropout_noise(original_data, dropout_rate=dropout_level, seed=42)

        # Define output filename
        output_filename = f"{dataset_name}_dropout_{int(dropout_level * 100)}.h5ad"
        output_path = os.path.join(output_dir, output_filename)

        # Save noisy dataset
        noisy_data.write_h5ad(output_path)

        elapsed_time = time.time() - start_time
        print(f"  ✅ Saved {output_filename} ({elapsed_time:.2f} sec)")

print("\n🎯 All noisy datasets successfully created and saved!")
📌 Processing dataset: pbmc
  🔹 Applying 10% dropout...
  ✅ Saved pbmc_dropout_10.h5ad (0.40 sec)
  🔹 Applying 30% dropout...
  ✅ Saved pbmc_dropout_30.h5ad (0.37 sec)

📌 Processing dataset: lung
  🔹 Applying 10% dropout...
  ✅ Saved lung_dropout_10.h5ad (1.27 sec)
  🔹 Applying 30% dropout...
  ✅ Saved lung_dropout_30.h5ad (1.22 sec)

📌 Processing dataset: heart
  🔹 Applying 10% dropout...
  ✅ Saved heart_dropout_10.h5ad (0.18 sec)
  🔹 Applying 30% dropout...
  ✅ Saved heart_dropout_30.h5ad (0.17 sec)

📌 Processing dataset: brain
  🔹 Applying 10% dropout...
  ✅ Saved brain_dropout_10.h5ad (1.04 sec)
  🔹 Applying 30% dropout...
  ✅ Saved brain_dropout_30.h5ad (1.01 sec)

🎯 All noisy datasets successfully created and saved!
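Since the dropout mask is applied uniformly to all matrix entries (zeros included), the realized fraction of formerly nonzero entries that get zeroed should track the nominal rate. A minimal sketch comparing nonzero counts before and after, assuming the file names saved above:

import scanpy as sc
import scipy.sparse as sp

def n_nonzero(X):
    # Count nonzero entries for sparse or dense matrices
    return X.nnz if sp.issparse(X) else int((X != 0).sum())

original = sc.read_h5ad("./Cluster_Evaluation_Data/pbmc.h5ad")
noisy = sc.read_h5ad("./Cluster_Evaluation_Data/pbmc_dropout_30.h5ad")

# With a uniform mask, roughly 30% of formerly nonzero entries should be zeroed
print(f"Realized dropout: {1 - n_nonzero(noisy.X) / n_nonzero(original.X):.2%}")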

1-III-C. MTG¶

This dataset includes single-nucleus transcriptomes from 166,868 total nuclei derived from 5 post-mortem human brain specimens, surveying cell type diversity in the middle temporal gyrus (MTG). In total, 127 transcriptomic supertypes were identified.

https://portal.brain-map.org/atlases-and-data/rnaseq/human-mtg-10x_sea-ad

The cell-type label column is subclass_label.

In [2]:
# Define directories
data_dir = "./"
output_dir = "./Cluster_Evaluation_Data/"

# Load the MTG reference dataset
rna_data_all = sc.read('./datasets/Reference_MTG_RNAseq_final-nuclei.2022-06-07.h5ad')
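With the object loaded, the ground-truth labels can be inspected directly; a minimal sketch, assuming the subclass_label column named above:

# Distribution of cell-type labels used as ground truth
print(rna_data_all.obs["subclass_label"].value_counts())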
In [1]:
import numpy as np
import scanpy as sc

# Define fractions for the final subsets
fractions = [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]

# Step 1: Shuffle all cell indices once (fixed seed for reproducibility, per SEED=42)
np.random.seed(42)
total_cells = rna_data_all.n_obs
shuffled_indices = np.random.permutation(total_cells)

# Step 2: Extract each subset as a prefix of the shuffled order
# (larger fractions therefore contain the smaller subsets)
for fraction in fractions:
    subset_size = int(fraction * total_cells)  # Compute subset size
    selected_indices = shuffled_indices[:subset_size]  # Take the first N% of indices

    # Extract and save subset
    subset = rna_data_all[selected_indices, :].copy()
    subset.write_h5ad(f"{output_dir}MTG_cells_subset_{int(fraction*100)}.h5ad")

    print(f"Subset {int(fraction*100)}% saved successfully with {subset_size} cells.")

print("Randomized direct splitting completed for all subsets.")
Subset 10% saved successfully with 13730 cells.
Subset 20% saved successfully with 27460 cells.
Subset 40% saved successfully with 54921 cells.
Subset 60% saved successfully with 82381 cells.
Subset 80% saved successfully with 109842 cells.
Subset 100% saved successfully with 137303 cells.
Randomized direct splitting completed for all subsets.
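Because each fraction takes a prefix of the same shuffled index order, the subsets are nested rather than disjoint; a sketch to verify this on the saved files, assuming unique obs_names and the file names written above:

import scanpy as sc

output_dir = "./Cluster_Evaluation_Data/"
small = set(sc.read_h5ad(f"{output_dir}MTG_cells_subset_10.h5ad").obs_names)
large = set(sc.read_h5ad(f"{output_dir}MTG_cells_subset_20.h5ad").obs_names)
print("10% subset contained in 20% subset:", small <= large)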