Benchmarking Procedures for scRegulate¶

Part 1 (Data Preparation - SEED=42)¶


1-I. GRouNdGAN Synthetic Data¶

All datasets in this section can be found on the GRouNdGAN website (last accessed 2/6/2025): https://emad-combine-lab.github.io/GRouNdGAN/benchmarking.html


1-I-A. PBMC - Human¶

PBMC-ALL (Zheng et al., 2017). Reference dataset: human peripheral blood mononuclear cell (PBMC Donor A) dataset from:

https://emad-combine-lab.github.io/GRouNdGAN/benchmarking.html

https://support.10xgenomics.com/single-cell-gene-expression/datasets/1.1.0/fresh_68k_pbmc_donor_a

Steps:¶

This workflow includes the following:

  1. QC filtering
  2. Normalization
  3. Sketch-based data splitting (5-fold)
In [1]:
import pandas as pd
import scanpy as sc
import geosketch
import numpy as np
In [2]:
read_path = './datasets/groundGan/PBMC-ALL_100K_cells.h5ad'
rna_data_all = sc.read_h5ad(read_path)
rna_data_all
Out[2]:
AnnData object with n_obs × n_vars = 100000 × 1000
In [3]:
print('Raw Data:')
print(f'rna_data_all.shape: {rna_data_all.shape}')
rna_data_all.var_names_make_unique()
rna_data_all.obs_names_make_unique()
# mitochondrial genes, "MT-" for human, "Mt-" for mouse
rna_data_all.var["mt"] = rna_data_all.var_names.str.startswith("MT-")
# ribosomal genes
rna_data_all.var["ribo"] = rna_data_all.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
rna_data_all.var["hb"] = rna_data_all.var_names.str.contains("^HB[^(P)]")
sc.pp.calculate_qc_metrics(rna_data_all, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True)
sc.pp.filter_cells(rna_data_all, min_genes=10)
sc.pp.filter_genes(rna_data_all, min_cells=3)
sc.pp.normalize_total(rna_data_all)
sc.pp.log1p(rna_data_all)
print('Final Data:')
print(f'rna_data_all.shape: {rna_data_all.shape}')
Raw Data:
rna_data_all.shape: (100000, 1000)
Final Data:
rna_data_all.shape: (99998, 986)
In [4]:
import geosketch
import numpy as np
import scanpy as sc

# Full dataset (rna_data_all) is assumed to be loaded and preprocessed above
num_subsets = 5
desired_subset_size = rna_data_all.shape[0] // num_subsets  # 20K per subset
random_seed = 42

# Step 1: Cluster the Data
sc.pp.neighbors(rna_data_all, use_rep="X")
sc.tl.leiden(rna_data_all, resolution=1.0, flavor="igraph", n_iterations=2)
rna_data_all.obs["pseudo_cluster"] = rna_data_all.obs["leiden"]

pseudo_clusters = rna_data_all.obs["pseudo_cluster"].unique()
split_datasets = [[] for _ in range(num_subsets)]

# Step 2: Proportional Stratified Sampling within Each Pseudo-Cluster
total_cells = rna_data_all.n_obs

for cluster in pseudo_clusters:
    # Get indices of cells in the current cluster
    cluster_indices = np.where(rna_data_all.obs["pseudo_cluster"] == cluster)[0]
    n_cluster = len(cluster_indices)
    
    # Per-subset target for this cluster, proportional to cluster size
    # (int() floors, so each subset may come out slightly below the desired size)
    target_cells_cluster = int((n_cluster / total_cells) * desired_subset_size)

    # Never request more cells than the cluster contains
    target_cells_cluster = min(target_cells_cluster, n_cluster)

    # Skip clusters too small to contribute at least one cell per subset
    if target_cells_cluster == 0:
        continue

    for i in range(num_subsets):
        # Apply geosketch with a seed offset for reproducibility
        sketch_indices = geosketch.gs(rna_data_all[cluster_indices].X,
                                      target_cells_cluster,
                                      replace=False,
                                      seed=random_seed + i)
        # Map back to global indices
        selected_indices = cluster_indices[sketch_indices]
        split_datasets[i].extend(selected_indices)

# Step 3: Convert selected indices into separate AnnData objects
for i in range(num_subsets):
    subset = rna_data_all[split_datasets[i], :].copy()
    subset.write_h5ad(f"./datasets/groundGan/PBMC-ALL_100K_cells_subset_{i+1}.h5ad")
    
print("Proportional stratified sketching applied. Datasets successfully saved.")
Proportional stratified sketching applied. Datasets successfully saved.
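Because geosketch is run independently for each subset (with different seeds) over the same per-cluster cell pool, the five sketches are not guaranteed to be mutually disjoint. A minimal sketch to quantify pairwise overlap between the saved subsets, assuming the file names written above:

import itertools
import scanpy as sc

paths = [f"./datasets/groundGan/PBMC-ALL_100K_cells_subset_{i+1}.h5ad" for i in range(5)]
cell_sets = [set(sc.read_h5ad(p).obs_names) for p in paths]

# Report the number of shared cells for every pair of subsets
for i, j in itertools.combinations(range(5), 2):
    print(f"Subset {i+1} vs Subset {j+1}: {len(cell_sets[i] & cell_sets[j])} shared cells")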
In [5]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt

# Define file paths
data_dir = "./datasets/groundGan/"
dataset_paths = [f"{data_dir}PBMC-ALL_100K_cells_subset_{i+1}.h5ad" for i in range(5)]

# Load datasets and compute mean expression
mean_expressions = []
for path in dataset_paths:
    adata = sc.read_h5ad(path)
    mean_expressions.append(np.array(adata.X.mean(axis=0)).flatten())

# Convert to DataFrame
mean_expr_df = pd.DataFrame(np.vstack(mean_expressions).T)

# Compute Pearson correlation between datasets
correlation_matrix = mean_expr_df.corr(method="pearson")

# Plot heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f",
            xticklabels=[f"Subset {i+1}" for i in range(5)],
            yticklabels=[f"Subset {i+1}" for i in range(5)])
plt.title("Correlation Between Dataset Subsets")
plt.show()
[Figure: Pearson correlation heatmap between the five PBMC-ALL subsets]
In [6]:
import scanpy as sc
import matplotlib.pyplot as plt

# Define file paths
data_dir = "./datasets/groundGan/"
dataset_paths = [f"{data_dir}PBMC-ALL_100K_cells_subset_{i+1}.h5ad" for i in range(5)]

# Set up the figure for multiple UMAPs
fig, axes = plt.subplots(1, 5, figsize=(20, 4))

# Process each dataset
for i, path in enumerate(dataset_paths):
    adata = sc.read_h5ad(path)

    # Compute UMAP if not already computed
    if 'X_umap' not in adata.obsm:
        sc.pp.neighbors(adata, use_rep="X")  # Compute neighbors using gene expression
        sc.tl.umap(adata)  # Compute UMAP

    # Plot UMAP
    sc.pl.umap(adata, ax=axes[i], show=False, title=f"Subset {i+1}")

plt.tight_layout()
plt.show()
[Figure: UMAP embeddings of the five PBMC-ALL subsets]

1-I-B. Tumor-ALL (Han et al., 2022) - Human¶

Reference dataset: malignant cells as well as cells in the tumor microenvironment (called the Tumor-ALL dataset here) from 20 fresh core needle biopsies of follicular lymphoma patients.

https://emad-combine-lab.github.io/GRouNdGAN/benchmarking.html

https://cellxgene.cziscience.com/collections/968834a0-1895-40df-8720-666029b3bbac

In [2]:
rna_data_all = sc.read_h5ad('./datasets/groundGan/Tumor-ALL_100K_cells.h5ad')
rna_data_all
Out[2]:
AnnData object with n_obs × n_vars = 100000 × 1000
In [3]:
print('Raw Data:')
print(f'rna_data_all.shape: {rna_data_all.shape}')
rna_data_all.var_names_make_unique()
rna_data_all.obs_names_make_unique()
# mitochondrial genes, "MT-" for human, "Mt-" for mouse
rna_data_all.var["mt"] = rna_data_all.var_names.str.startswith("MT-")
# ribosomal genes
rna_data_all.var["ribo"] = rna_data_all.var_names.str.startswith(("RPS", "RPL"))
# hemoglobin genes
rna_data_all.var["hb"] = rna_data_all.var_names.str.contains("^HB[^(P)]")
sc.pp.calculate_qc_metrics(rna_data_all, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True)
sc.pp.filter_cells(rna_data_all, min_genes=10)
sc.pp.filter_genes(rna_data_all, min_cells=3)
sc.pp.normalize_total(rna_data_all)
sc.pp.log1p(rna_data_all)
print('Final Data:')
print(f'rna_data_all.shape: {rna_data_all.shape}')
Raw Data:
rna_data_all.shape: (100000, 1000)
Final Data:
rna_data_all.shape: (100000, 836)
In [4]:
import geosketch
import numpy as np
import scanpy as sc

# Full dataset (rna_data_all) is assumed to be loaded and preprocessed above
num_subsets = 5
desired_subset_size = rna_data_all.shape[0] // num_subsets  # 20K per subset
random_seed = 42

# Step 1: Cluster the Data
sc.pp.neighbors(rna_data_all, use_rep="X")
sc.tl.leiden(rna_data_all, resolution=1.0, flavor="igraph", n_iterations=2)
rna_data_all.obs["pseudo_cluster"] = rna_data_all.obs["leiden"]

pseudo_clusters = rna_data_all.obs["pseudo_cluster"].unique()
split_datasets = [[] for _ in range(num_subsets)]

# Step 2: Proportional Stratified Sampling within Each Pseudo-Cluster
total_cells = rna_data_all.n_obs

for cluster in pseudo_clusters:
    # Get indices of cells in the current cluster
    cluster_indices = np.where(rna_data_all.obs["pseudo_cluster"] == cluster)[0]
    n_cluster = len(cluster_indices)
    
    # Per-subset target for this cluster, proportional to cluster size
    # (int() floors, so each subset may come out slightly below the desired size)
    target_cells_cluster = int((n_cluster / total_cells) * desired_subset_size)

    # Never request more cells than the cluster contains
    target_cells_cluster = min(target_cells_cluster, n_cluster)

    # Skip clusters too small to contribute at least one cell per subset
    if target_cells_cluster == 0:
        continue

    for i in range(num_subsets):
        # Apply geosketch with a seed offset for reproducibility
        sketch_indices = geosketch.gs(rna_data_all[cluster_indices].X,
                                      target_cells_cluster,
                                      replace=False,
                                      seed=random_seed + i)
        # Map back to global indices
        selected_indices = cluster_indices[sketch_indices]
        split_datasets[i].extend(selected_indices)

# Step 3: Convert selected indices into separate AnnData objects
for i in range(num_subsets):
    subset = rna_data_all[split_datasets[i], :].copy()
    subset.write_h5ad(f"./datasets/groundGan/Tumor-ALL_100K_cells_subset_{i+1}.h5ad")

print("Stratified sketching using pseudo-clusters applied. Datasets successfully saved.")
Stratified sketching using pseudo-clusters applied. Datasets successfully saved.
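A quick size check on the saved subsets (assuming the file names written above); because per-cluster targets are floored with int(), each subset can land slightly below the 20K target:

import scanpy as sc

for i in range(5):
    adata = sc.read_h5ad(f"./datasets/groundGan/Tumor-ALL_100K_cells_subset_{i+1}.h5ad")
    print(f"Subset {i+1}: {adata.n_obs} cells x {adata.n_vars} genes")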
In [5]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt

# Define file paths
data_dir = "./datasets/groundGan/"
dataset_paths = [f"{data_dir}Tumor-ALL_100K_cells_subset_{i+1}.h5ad" for i in range(5)]

# Load datasets and compute mean expression
mean_expressions = []
for path in dataset_paths:
    adata = sc.read_h5ad(path)
    mean_expressions.append(np.array(adata.X.mean(axis=0)).flatten())

# Convert to DataFrame
mean_expr_df = pd.DataFrame(np.vstack(mean_expressions).T)

# Compute Pearson correlation between datasets
correlation_matrix = mean_expr_df.corr(method="pearson")

# Plot heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f",
            xticklabels=[f"Subset {i+1}" for i in range(5)],
            yticklabels=[f"Subset {i+1}" for i in range(5)])
plt.title("Correlation Between Dataset Subsets")
plt.show()
[Figure: Pearson correlation heatmap between the five Tumor-ALL subsets]
In [2]:
import scanpy as sc
import matplotlib.pyplot as plt

# Define file paths
data_dir = "./datasets/groundGan/"
dataset_paths = [f"{data_dir}Tumor-ALL_100K_cells_subset_{i+1}.h5ad" for i in range(5)]

# Set up the figure for multiple UMAPs
fig, axes = plt.subplots(1, 5, figsize=(20, 4))

# Process each dataset
for i, path in enumerate(dataset_paths):
    adata = sc.read_h5ad(path)

    # Compute UMAP if not already computed
    if 'X_umap' not in adata.obsm:
        sc.pp.neighbors(adata, use_rep="X")  # Compute neighbors using gene expression
        sc.tl.umap(adata)  # Compute UMAP

    # Plot UMAP
    sc.pl.umap(adata, ax=axes[i], show=False, title=f"Subset {i+1}")

plt.tight_layout()
plt.show()
[Figure: UMAP embeddings of the five Tumor-ALL subsets]

1-I-C. Dahlin (Dahlin et al., 2018) - Mouse¶

Reference dataset: Hematopoietic dataset corresponding to the scRNA-seq (10x Genomics) profiles of mouse bone marrow hematopoietic stem and progenitor cells (HSPCs) differentiating towards different lineages from GEO (accession number: GSE107727).

https://nextcloud.computecanada.ca/index.php/s/WXLKasKsjrGaEAR/download

https://emad-combine-lab.github.io/GRouNdGAN/benchmarking.html
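The file can be fetched directly from the Nextcloud link above before running the cells below; a minimal sketch using urllib, assuming that link serves the Dahlin .h5ad, saved under the local name read in the next cell:

import os
import urllib.request

url = "https://nextcloud.computecanada.ca/index.php/s/WXLKasKsjrGaEAR/download"
out_path = "./datasets/groundGan/Dahlin_100K_cells.h5ad"  # assumed local name, matching the read below
os.makedirs(os.path.dirname(out_path), exist_ok=True)
urllib.request.urlretrieve(url, out_path)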

In [10]:
rna_data_all = sc.read_h5ad('./datasets/groundGan/Dahlin_100K_cells.h5ad')
rna_data_all
Out[10]:
AnnData object with n_obs × n_vars = 100000 × 1000
In [11]:
print('Raw Data:')
print(f'rna_data_all.shape: {rna_data_all.shape}')
rna_data_all.var_names_make_unique()
rna_data_all.obs_names_make_unique()
# mitochondrial genes, "MT-" for human, "Mt-" for mouse
rna_data_all.var["mt"] = rna_data_all.var_names.str.startswith("Mt-")
# ribosomal genes
rna_data_all.var["ribo"] = rna_data_all.var_names.str.startswith(("Rps", "Rpl"))
# hemoglobin genes
rna_data_all.var["hb"] = rna_data_all.var_names.str.contains("^Hb[^p]")
sc.pp.calculate_qc_metrics(rna_data_all, qc_vars=["mt", "ribo", "hb"], inplace=True, log1p=True)
sc.pp.filter_cells(rna_data_all, min_genes=10)
sc.pp.filter_genes(rna_data_all, min_cells=3)
sc.pp.normalize_total(rna_data_all)
sc.pp.log1p(rna_data_all)
print('Final Data:')
print(f'rna_data_all.shape: {rna_data_all.shape}')
Raw Data:
rna_data_all.shape: (100000, 1000)
Final Data:
rna_data_all.shape: (100000, 971)
In [12]:
import geosketch
import numpy as np
import scanpy as sc

# Full dataset (rna_data_all) is assumed to be loaded and preprocessed above
num_subsets = 5
desired_subset_size = rna_data_all.shape[0] // num_subsets  # 20K per subset
random_seed = 42

# Step 1: Cluster the Data
sc.pp.neighbors(rna_data_all, use_rep="X")
sc.tl.leiden(rna_data_all, resolution=1.0, flavor="igraph", n_iterations=2)
rna_data_all.obs["pseudo_cluster"] = rna_data_all.obs["leiden"]

pseudo_clusters = rna_data_all.obs["pseudo_cluster"].unique()
split_datasets = [[] for _ in range(num_subsets)]

# Step 2: Proportional Stratified Sampling within Each Pseudo-Cluster
total_cells = rna_data_all.n_obs

for cluster in pseudo_clusters:
    # Get indices of cells in the current cluster
    cluster_indices = np.where(rna_data_all.obs["pseudo_cluster"] == cluster)[0]
    n_cluster = len(cluster_indices)
    
    # Per-subset target for this cluster, proportional to cluster size
    # (int() floors, so each subset may come out slightly below the desired size)
    target_cells_cluster = int((n_cluster / total_cells) * desired_subset_size)

    # Never request more cells than the cluster contains
    target_cells_cluster = min(target_cells_cluster, n_cluster)

    # Skip clusters too small to contribute at least one cell per subset
    if target_cells_cluster == 0:
        continue

    for i in range(num_subsets):
        # Apply geosketch with a seed offset for reproducibility
        sketch_indices = geosketch.gs(rna_data_all[cluster_indices].X,
                                      target_cells_cluster,
                                      replace=False,
                                      seed=random_seed + i)
        # Map back to global indices
        selected_indices = cluster_indices[sketch_indices]
        split_datasets[i].extend(selected_indices)

# Step 3: Convert selected indices into separate AnnData objects
for i in range(num_subsets):
    subset = rna_data_all[split_datasets[i], :].copy()
    # Save subset
    subset.write_h5ad(f"./datasets/groundGan/Dahlin_cells_subset_{i+1}.h5ad")

print("Datasets successfully split into disjoint, representative subsets using geosketch.")
Datasets successfully split into disjoint, representative subsets using geosketch.
In [13]:
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt

# Define file paths
data_dir = "./datasets/groundGan/"
dataset_paths = [f"{data_dir}Dahlin_cells_subset_{i+1}.h5ad" for i in range(5)]

# Load datasets and compute mean expression
mean_expressions = []
for path in dataset_paths:
    adata = sc.read_h5ad(path)
    mean_expressions.append(np.array(adata.X.mean(axis=0)).flatten())

# Convert to DataFrame
mean_expr_df = pd.DataFrame(np.vstack(mean_expressions).T)

# Compute Pearson correlation between datasets
correlation_matrix = mean_expr_df.corr(method="pearson")

# Plot heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f",
            xticklabels=[f"Subset {i+1}" for i in range(5)],
            yticklabels=[f"Subset {i+1}" for i in range(5)])
plt.title("Correlation Between Dataset Subsets")
plt.show()
[Figure: Pearson correlation heatmap between the five Dahlin subsets]
In [3]:
import scanpy as sc
import matplotlib.pyplot as plt

# Define file paths
data_dir = "./datasets/groundGan/"
dataset_paths = [f"{data_dir}Dahlin_cells_subset_{i+1}.h5ad" for i in range(5)]

# Set up the figure for multiple UMAPs
fig, axes = plt.subplots(1, 5, figsize=(20, 4))

# Process each dataset
for i, path in enumerate(dataset_paths):
    adata = sc.read_h5ad(path)

    # Compute UMAP if not already computed
    if 'X_umap' not in adata.obsm:
        sc.pp.neighbors(adata, use_rep="X")  # Compute neighbors using gene expression
        sc.tl.umap(adata)  # Compute UMAP

    # Plot UMAP
    sc.pl.umap(adata, ax=axes[i], show=False, title=f"Subset {i+1}")

plt.tight_layout()
plt.show()
[Figure: UMAP embeddings of the five Dahlin subsets]

1-II. Perturb-seq Data¶

1-II-A. Dixit et al., 2016¶

In [5]:
rna_data_all = sc.read('./datasets/Dixit_2016.h5ad')
sc.pp.filter_cells(rna_data_all, min_genes=10)
sc.pp.filter_genes(rna_data_all, min_cells=3)
# fraction=1 keeps every cell; this call only shuffles cell order reproducibly
sc.pp.subsample(rna_data_all, fraction=1, random_state=42)
rna_data_all
Out[5]:
AnnData object with n_obs × n_vars = 99722 × 18531
    obs: 'cluster', 'guide', 'INTERGENIC1144056', 'INTERGENIC1216445', 'INTERGENIC216151', 'INTERGENIC393453', 'sgCREB1_2', 'sgCREB1_4', 'sgCREB1_5', 'sgE2F4_6', 'sgE2F4_7', 'sgEGR1_2', 'sgEGR1_3', 'sgEGR1_4', 'sgELF1_1', 'sgELF1_2', 'sgELF1_4', 'sgELF1_5', 'sgELK1_1', 'sgELK1_6', 'sgELK1_7', 'sgETS1_3', 'sgETS1_5', 'sgGABPA_1', 'sgGABPA_9', 'sgIRF1_2', 'sgIRF1_3', 'sgNR2C2_2', 'sgNR2C2_3', 'sgNR2C2_5', 'sgYY1_10', 'sgYY1_3', 'batch', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden', 'perturbation_name', 'perturbation_type', 'perturbation_value', 'perturbation_unit'
    var: 'index', 'n_cells', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'doi', 'hvg', 'leiden', 'neighbors', 'pca', 'preprocessing_nb_link', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    layers: 'counts'
    obsp: 'connectivities', 'distances'
In [9]:
from IPython.display import display

# Perturbation labels per cell; this Series is iterated below
perturbations = rna_data_all.obs['perturbation_name']

# Initialize list to hold info
perturbation_info = []

# Loop through perturbations
for pert in perturbations.unique():
    if pert == 'control':
        pert_type = 'control'
        tf_count = 0
        same_tf = 'N/A'
    else:
        tfs = pert.split('+')
        tf_count = len(tfs)
        same_tf = 'same' if len(set(tfs)) == 1 else 'mixed'
        
        if tf_count == 1:
            pert_type = 'single'
        elif tf_count == 2:
            pert_type = 'double'
        elif tf_count == 3:
            pert_type = 'triple'
        else:
            pert_type = f'{tf_count}-ple'

    # Count number of cells for each perturbation
    cell_count = (perturbations == pert).sum()

    # Append results
    perturbation_info.append({
        'Perturbation': pert,
        'Type': pert_type,
        'TF_Count': tf_count,
        'Same_or_Mixed': same_tf,
        'Cell_Count': cell_count
    })

# Create DataFrame
perturb_df = pd.DataFrame(perturbation_info)

# Sort for better readability
perturb_df = perturb_df.sort_values(by=['Type', 'Same_or_Mixed', 'Cell_Count'], ascending=[True, True, False])

# Display the DataFrame in Jupyter Notebook
display(perturb_df)


# Filter for 'same' TF knockouts only
same_tf_df = perturb_df[perturb_df['Same_or_Mixed'] == 'same']
display(same_tf_df)
Perturbation Type TF_Count Same_or_Mixed Cell_Count
386 INTERGENIC1216445+E2F4+ELK1+ETS1 4-ple 4 mixed 2
512 EGR1+ELF1+ELK1+NR2C2 4-ple 4 mixed 2
524 CREB1+ELF1+ETS1+YY1 4-ple 4 mixed 2
140 E2F4+EGR1+NR2C2+YY1 4-ple 4 mixed 1
231 INTERGENIC393453+E2F4+ELK1+NR2C2 4-ple 4 mixed 1
... ... ... ... ... ...
639 ELF1+NR2C2+YY10 triple 3 mixed 1
640 INTERGENIC1144056+ELK1+YY10 triple 3 mixed 1
470 EGR1+EGR1+EGR1 triple 3 same 1
492 ELF1+ELF1+ELF1 triple 3 same 1
581 CREB1+CREB1+CREB1 triple 3 same 1

641 rows × 5 columns

Perturbation Type TF_Count Same_or_Mixed Cell_Count
39 ELF1+ELF1 double 2 same 219
108 ELK1+ELK1 double 2 same 82
139 CREB1+CREB1 double 2 same 79
235 EGR1+EGR1 double 2 same 53
109 NR2C2+NR2C2 double 2 same 45
151 GABPA+GABPA double 2 same 30
43 ETS1+ETS1 double 2 same 28
144 E2F4+E2F4 double 2 same 18
344 IRF1+IRF1 double 2 same 7
0 ELF1 single 1 same 11676
17 CREB1 single 1 same 7587
8 ELK1 single 1 same 7366
16 INTERGENIC393453 single 1 same 7177
14 EGR1 single 1 same 6181
15 GABPA single 1 same 5462
7 NR2C2 single 1 same 5030
4 ETS1 single 1 same 4815
10 E2F4 single 1 same 4516
9 IRF1 single 1 same 3837
18 INTERGENIC1144056 single 1 same 3518
22 INTERGENIC216151 single 1 same 3518
12 INTERGENIC1216445 single 1 same 2758
26 YY1 single 1 same 2499
3 YY10 single 1 same 2092
470 EGR1+EGR1+EGR1 triple 3 same 1
492 ELF1+ELF1+ELF1 triple 3 same 1
581 CREB1+CREB1+CREB1 triple 3 same 1
In [10]:
# Filter for double-same knockouts and control
double_same_or_control = perturb_df[
    ((perturb_df['Type'] == 'double') & (perturb_df['Same_or_Mixed'] == 'same')) | 
    (perturb_df['Type'] == 'control')
]['Perturbation']

# Subset the AnnData object
double_same_rna_data = rna_data_all[rna_data_all.obs['perturbation_name'].isin(double_same_or_control)].copy()

# Check the unique perturbations in the subset
print(double_same_rna_data.obs['perturbation_name'].unique())
['control', 'ELF1+ELF1', 'ETS1+ETS1', 'ELK1+ELK1', 'NR2C2+NR2C2', 'CREB1+CREB1', 'E2F4+E2F4', 'GABPA+GABPA', 'EGR1+EGR1', 'IRF1+IRF1']
Categories (10, object): ['CREB1+CREB1', 'E2F4+E2F4', 'EGR1+EGR1', 'ELF1+ELF1', ..., 'GABPA+GABPA', 'IRF1+IRF1', 'NR2C2+NR2C2', 'control']
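Before saving, it is worth confirming the per-condition cell counts in the filtered object; a minimal sketch:

# Cells per retained condition (double-same knockouts plus control)
print(double_same_rna_data.obs['perturbation_name'].value_counts())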
In [11]:
double_same_rna_data.write_h5ad('Dixit_2016_clean.h5ad')

1-III. Benchmark Data for Clustering¶

1-III-A. Preprocessing for benchmarking¶

Data used in this section:¶

  1. Human PBMC 3K scRNA
  2. Mouse Tabula Muris Lung scRNA
  3. Mouse Tabula Muris Heart scRNA
  4. Mouse Tabula Muris Brain scRNA
In [36]:
# Define directories
data_dir = "./"
output_dir = "./Cluster_Evaluation_Data/"

# Define benchmark datasets
benchmark_datasets = {
    "pbmc": {"source": "builtin"}, 
    "lung": {"path": f"{data_dir}lung.h5ad"}, 
    "heart": {"path": f"{data_dir}heart.h5ad"},
    "brain": {"path": f"{data_dir}brain.h5ad"},
}
In [38]:
import os
def preprocess_rna_data(rna_data):
    # Basic QC filtering (no shuffling is performed here)
    sc.pp.filter_cells(rna_data, min_genes=10)
    sc.pp.filter_genes(rna_data, min_cells=3)
    
    # Assign ground truth labels
    if "cell_ontology_class" in rna_data.obs:
        rna_data.obs["truth"] = rna_data.obs["cell_ontology_class"].copy()
        sc.pp.normalize_total(rna_data, target_sum=1e6)
        sc.pp.log1p(rna_data)
    elif "louvain" in rna_data.obs:
        rna_data.obs["truth"] = rna_data.obs["louvain"]
    return rna_data

# Training loop
benchmark_results = {}

for dataset_name, dataset_info in benchmark_datasets.items():
    print(f"\n🔹 Preprocessing {dataset_name}...")

    # Load RNA data
    if dataset_info.get("source") == "builtin":
        rna_data = sc.datasets.pbmc3k_processed()
        raw_X = rna_data.raw.X.copy()
        obs = rna_data.obs.copy()
        var = rna_data.raw.var.copy()
        rna_data = sc.AnnData(X=raw_X, obs=obs, var=var)
        rna_data = preprocess_rna_data(rna_data)
    else:
        rna_data = sc.read_h5ad(dataset_info["path"])
        rna_data.X = rna_data.raw.X.copy()
        # Workaround: writing h5ad fails if a raw var column is named '_index'
        rna_data._raw._var.rename(columns={'_index': 'features'}, inplace=True)
        rna_data = preprocess_rna_data(rna_data)

    # Save processed dataset using `os.path.join()`
    save_path = os.path.join(output_dir, f"{dataset_name}.h5ad")
    rna_data.write_h5ad(save_path)
    
    # Print success message
    print(f"✅ Saved {dataset_name} to {save_path}")
🔹 Preprocessing pbmc...
✅ Saved pbmc to ./Cluster_Evaluation_Data/pbmc.h5ad

🔹 Preprocessing lung...
✅ Saved lung to ./Cluster_Evaluation_Data/lung.h5ad

🔹 Preprocessing heart...
✅ Saved heart to ./Cluster_Evaluation_Data/heart.h5ad

🔹 Preprocessing brain...
✅ Saved brain to ./Cluster_Evaluation_Data/brain.h5ad
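A short sanity check that each saved benchmark file loads and carries the ground-truth labels assigned in preprocess_rna_data; a sketch, assuming the paths printed above:

import os
import scanpy as sc

output_dir = "./Cluster_Evaluation_Data/"
for name in ["pbmc", "lung", "heart", "brain"]:
    adata = sc.read_h5ad(os.path.join(output_dir, f"{name}.h5ad"))
    print(f"{name}: {adata.shape}, {adata.obs['truth'].nunique()} ground-truth cell types")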

1-III-B. Simulating 10% and 30% Dropout Effect¶

Data used in this section:¶

  1. Human PBMC 3K scRNA
  2. Mouse Tabula Muris Lung scRNA
  3. Mouse Tabula Muris Heart scRNA
  4. Mouse Tabula Muris Brain scRNA
In [4]:
import numpy as np
import scipy.sparse as sp
import scanpy as sc
import os
import time

# =============================================================================
# Noise Injection Functions
# =============================================================================

def add_gaussian_noise(adata, noise_level=0.05, seed=42):
    """
    Injects Gaussian noise into a sparse AnnData object.

    Parameters:
    - adata: AnnData object (scRNA-seq data)
    - noise_level: Percentage of noise to add (e.g., 0.05 for 5% noise)
    - seed: Fixed random seed for reproducibility

    Returns:
    - Noisy AnnData object (new copy) with sparse format preserved
    """
    np.random.seed(seed)
    noisy_adata = adata.copy()

    # Convert sparse to dense for processing
    data_dense = noisy_adata.X.toarray() if sp.issparse(noisy_adata.X) else noisy_adata.X

    # Compute standard deviation of each gene (column-wise)
    std_devs = np.std(data_dense, axis=0)

    # Generate Gaussian noise (scaled by gene-specific std)
    noise = np.random.normal(loc=0, scale=noise_level * std_devs, size=data_dense.shape)

    # Add noise and clip negative values to zero
    data_noisy = np.clip(data_dense + noise, 0, None)

    # Convert back to sparse matrix to save memory
    noisy_adata.X = sp.csr_matrix(data_noisy)

    return noisy_adata

def add_dropout_noise(adata, dropout_rate=0.2, seed=42):
    """
    Simulates dropout noise by randomly setting a fraction of values to zero.

    Parameters:
    - adata: AnnData object
    - dropout_rate: Fraction of values to drop (0.1 = 10% dropout)
    - seed: Random seed for reproducibility

    Returns:
    - AnnData object with dropout noise
    """
    np.random.seed(seed)
    noisy_adata = adata.copy()

    # Convert sparse matrix to dense for processing
    data_dense = noisy_adata.X.toarray() if sp.issparse(noisy_adata.X) else noisy_adata.X

    # Generate a dropout mask (1 = keep, 0 = drop)
    dropout_mask = np.random.rand(*data_dense.shape) > dropout_rate

    # Apply dropout noise
    data_noisy = data_dense * dropout_mask

    # Convert back to sparse matrix
    noisy_adata.X = sp.csr_matrix(data_noisy)

    return noisy_adata

# =============================================================================
# Dataset Processing
# =============================================================================

# Define directories
data_dir = "./Cluster_Evaluation_Data/"
output_dir = "./Cluster_Evaluation_Data/"

# Define benchmark datasets
benchmark_datasets = {
    "pbmc": {"path": f"{data_dir}pbmc.h5ad"},
    "lung": {"path": f"{data_dir}lung.h5ad"},
    "heart": {"path": f"{data_dir}heart.h5ad"},
    "brain": {"path": f"{data_dir}brain.h5ad"},
}

# Dropout levels to test
dropout_levels = [0.1, 0.3]

for dataset_name, dataset_info in benchmark_datasets.items():
    print(f"\n📌 Processing dataset: {dataset_name}")

    # Load dataset
    original_data = sc.read_h5ad(dataset_info["path"])

    for dropout_level in dropout_levels:
        print(f"  🔹 Applying {int(dropout_level * 100)}% dropout...")

        start_time = time.time()

        # Apply dropout noise
        noisy_data = add_dropout_noise(original_data, dropout_rate=dropout_level, seed=42)

        # Define output filename
        output_filename = f"{dataset_name}_dropout_{int(dropout_level * 100)}.h5ad"
        output_path = os.path.join(output_dir, output_filename)

        # Save noisy dataset
        noisy_data.write_h5ad(output_path)

        elapsed_time = time.time() - start_time
        print(f"  ✅ Saved {output_filename} ({elapsed_time:.2f} sec)")

print("\n🎯 All noisy datasets successfully created and saved!")
📌 Processing dataset: pbmc
  🔹 Applying 10% dropout...
  ✅ Saved pbmc_dropout_10.h5ad (0.40 sec)
  🔹 Applying 30% dropout...
  ✅ Saved pbmc_dropout_30.h5ad (0.37 sec)

📌 Processing dataset: lung
  🔹 Applying 10% dropout...
  ✅ Saved lung_dropout_10.h5ad (1.27 sec)
  🔹 Applying 30% dropout...
  ✅ Saved lung_dropout_30.h5ad (1.22 sec)

📌 Processing dataset: heart
  🔹 Applying 10% dropout...
  ✅ Saved heart_dropout_10.h5ad (0.18 sec)
  🔹 Applying 30% dropout...
  ✅ Saved heart_dropout_30.h5ad (0.17 sec)

📌 Processing dataset: brain
  🔹 Applying 10% dropout...
  ✅ Saved brain_dropout_10.h5ad (1.04 sec)
  🔹 Applying 30% dropout...
  ✅ Saved brain_dropout_30.h5ad (1.01 sec)

🎯 All noisy datasets successfully created and saved!
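Since the dropout mask is applied uniformly to all matrix entries (zeros included), the realized fraction of formerly nonzero entries that get zeroed should track the nominal rate. A minimal sketch comparing nonzero counts before and after, assuming the file names saved above:

import scanpy as sc
import scipy.sparse as sp

def n_nonzero(X):
    # Count nonzero entries for sparse or dense matrices
    return X.nnz if sp.issparse(X) else int((X != 0).sum())

original = sc.read_h5ad("./Cluster_Evaluation_Data/pbmc.h5ad")
noisy = sc.read_h5ad("./Cluster_Evaluation_Data/pbmc_dropout_30.h5ad")

# With a uniform mask, roughly 30% of formerly nonzero entries should be zeroed
print(f"Realized dropout: {1 - n_nonzero(noisy.X) / n_nonzero(original.X):.2%}")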

1-III-C. MTG¶

This dataset includes single-nucleus transcriptomes from 166,868 total nuclei derived from 5 post-mortem human brain specimens, surveying cell type diversity in the middle temporal gyrus (MTG). In total, 127 transcriptomic supertypes were identified.

https://portal.brain-map.org/atlases-and-data/rnaseq/human-mtg-10x_sea-ad

The cell-type label column is subclass_label.

In [2]:
# Define directories
data_dir = "./"
output_dir = "./Cluster_Evaluation_Data/"

# Load the MTG reference dataset
rna_data_all = sc.read('./datasets/Reference_MTG_RNAseq_final-nuclei.2022-06-07.h5ad')
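With the object loaded, the ground-truth labels can be inspected directly; a minimal sketch, assuming the subclass_label column named above:

# Distribution of cell-type labels used as ground truth
print(rna_data_all.obs["subclass_label"].value_counts())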
In [1]:
import numpy as np
import scanpy as sc

# Define fractions for the final subsets
fractions = [0.1, 0.2, 0.4, 0.6, 0.8, 1.0]

# Step 1: Shuffle all cell indices once (fixed seed for reproducibility, per SEED=42)
np.random.seed(42)
total_cells = rna_data_all.n_obs
shuffled_indices = np.random.permutation(total_cells)

# Step 2: Extract each subset as a prefix of the shuffled order
# (larger fractions therefore contain the smaller subsets)
for fraction in fractions:
    subset_size = int(fraction * total_cells)  # Compute subset size
    selected_indices = shuffled_indices[:subset_size]  # Take the first N% of indices

    # Extract and save subset
    subset = rna_data_all[selected_indices, :].copy()
    subset.write_h5ad(f"{output_dir}MTG_cells_subset_{int(fraction*100)}.h5ad")

    print(f"Subset {int(fraction*100)}% saved successfully with {subset_size} cells.")

print("Randomized direct splitting completed for all subsets.")
Subset 10% saved successfully with 13730 cells.
Subset 20% saved successfully with 27460 cells.
Subset 40% saved successfully with 54921 cells.
Subset 60% saved successfully with 82381 cells.
Subset 80% saved successfully with 109842 cells.
Subset 100% saved successfully with 137303 cells.
Randomized direct splitting completed for all subsets.
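Because each fraction takes a prefix of the same shuffled index order, the subsets are nested rather than disjoint; a sketch to verify this on the saved files, assuming unique obs_names and the file names written above:

import scanpy as sc

output_dir = "./Cluster_Evaluation_Data/"
small = set(sc.read_h5ad(f"{output_dir}MTG_cells_subset_10.h5ad").obs_names)
large = set(sc.read_h5ad(f"{output_dir}MTG_cells_subset_20.h5ad").obs_names)
print("10% subset contained in 20% subset:", small <= large)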