Skip to content

Commit b0b3d3a

Browse files
author
Andrew Ramirez
committed
Simplify and pseudobulk all genes for cm
1 parent 36277af commit b0b3d3a

1 file changed

Lines changed: 31 additions & 188 deletions

File tree

RISE/figures/figureWong.py

Lines changed: 31 additions & 188 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,13 @@
11
"""
2-
Figure 5a_e Generation Scri # Get the final dataframe with mean gene expression for each sample
3-
# for top/bottom weighted overlapping genes in CM population only
4-
geneAmount = 500
5-
This module generates a comprehensive figure comparing PCA and PF2 component analyses
6-
for gene loadings and factors.
2+
Figure Wong Generation Script
3+
4+
This module generates mean expression data for CM cells across all genes.
75
"""
86

97
import anndata
108
import numpy as np
119
import pandas as pd
12-
import seaborn as sns
13-
from scipy import stats
14-
import mygene
1510
from .common import getSetup, subplotLabel
16-
import statsmodels.api as sm
1711
from ..imports import import_lupus
1812

1913

@@ -27,159 +21,59 @@ def makeFigure():
2721
print("Available cell types in dataset:")
2822
print(X.obs["Cell Type"].value_counts())
2923

30-
pc1_load = load_pc_loadings(pc_component=1)
31-
pc2_load = load_pc_loadings(pc_component=2)
32-
3324
# Get the final dataframe with mean gene expression for each sample
34-
# for top/bottom weighted overlapping genes in nCM and CM populations only
35-
geneAmount = 500
36-
df_final, selected_genes = get_mean_expression_overlapping_genes(X, pc1_load, pc2_load, geneAmount)
25+
# for CM population across all genes
26+
df_final = get_mean_expression_cm_cells(X)
27+
print(f"Final dataframe shape: {df_final.shape}")
28+
print(df_final.head())
29+
30+
if not df_final.empty:
31+
df_final.to_csv("lupus_mean_expression.csv", index=False)
32+
print("Data saved to lupus_mean_expression.csv")
33+
else:
34+
print("No data to save - dataframe is empty")
35+
3736
print(df_final)
38-
print(selected_genes)
39-
40-
# df_final.to_csv("lupus_mean_expression.csv", index=False)
41-
42-
# print(df_final)
43-
# print(f"Final dataframe shape: {df_final.shape}")
44-
# print(f"Number of selected genes: {len(selected_genes)}")
45-
# print(f"Selected genes: {selected_genes}")
46-
# print("\nFinal dataframe with mean gene expression per sample:")
47-
# print(df_final.head(10))
48-
# print(f"\nColumns in final dataframe: {list(df_final.columns)}")
4937

5038
return f
5139

5240

53-
54-
def load_pc_loadings(pc_component: int) -> pd.DataFrame:
55-
"""
56-
Load and preprocess PC loadings for a specific component.
57-
58-
Args:
59-
pc_component (int): Principal Component number (1 or 2)
60-
geneAmount (int): Number of genes to consider
61-
62-
Returns:
63-
pd.DataFrame: Processed PC loadings DataFrame
41+
def get_mean_expression_cm_cells(X):
6442
"""
65-
66-
df = pd.read_csv(f"loadings_time_series_PC{pc_component}.csv", dtype=str)
67-
df = df.rename(columns={"Unnamed: 0": "Gene"})
68-
df["Gene"] = convert_gene_symbols(df["Gene"])[0]
69-
df[f"PC{pc_component}"] = stats.zscore(df[f"PC{pc_component}"].astype(float))
70-
71-
return df
72-
73-
74-
def convert_gene_symbols(genes):
75-
"""
76-
Convert gene symbols using MyGene.
77-
78-
Args:
79-
genes (list): List of gene symbols
80-
species (str): Species for gene conversion
81-
82-
Returns:
83-
tuple: Converted genes and list of genes without hits
84-
"""
85-
mg = mygene.MyGeneInfo()
86-
results = mg.querymany(
87-
genes,
88-
scopes='symbol',
89-
fields=['symbol', 'entrezgene'],
90-
species='mouse',
91-
transformed=True
92-
)
93-
94-
conversion_map = []
95-
no_hit_genes = []
96-
97-
for gene, result in zip(genes, results):
98-
if result and 'symbol' in result:
99-
conversion_map.append(result.get('symbol', gene).upper())
100-
else:
101-
conversion_map.append(gene.upper())
102-
no_hit_genes.append(gene)
103-
104-
return conversion_map, no_hit_genes
105-
106-
107-
def get_mean_expression_overlapping_genes(X, pc1_load, pc2_load, geneAmount=10):
108-
"""
109-
Get final dataframe with mean gene expression for each sample for top/bottom weighted overlapping genes
110-
in CM population only.
43+
Get final dataframe with mean gene expression for each sample for CM cells across all genes.
11144
11245
Args:
11346
X: AnnData object with lupus data
114-
pc1_load: PC1 loadings DataFrame
115-
pc2_load: PC2 loadings DataFrame
116-
geneAmount: Number of top and bottom genes to include
11747
11848
Returns:
119-
pd.DataFrame: Final dataframe with mean expression per sample for selected genes
120-
list: List of selected gene names
49+
pd.DataFrame: Final dataframe with mean expression per sample for all genes
12150
"""
122-
# Find overlapping genes between PC loadings and dataset
123-
pc1_genes = set(pc1_load['Gene'])
124-
pc2_genes = set(pc2_load['Gene'])
125-
dataset_genes = set(X.var_names)
126-
127-
# Get genes that overlap between PC1, PC2, and the dataset
128-
overlap_genes = list(pc1_genes & pc2_genes & dataset_genes)
129-
130-
print(f"Found {len(overlap_genes)} overlapping genes")
131-
132-
if not overlap_genes:
133-
raise ValueError("No overlapping genes found between PC loadings and dataset")
134-
135-
# Get top and bottom weighted genes from PC1 and PC2
136-
selected_genes = []
137-
138-
for pc_load, pc_name in [(pc1_load, "PC1"), (pc2_load, "PC2")]:
139-
# Filter to overlapping genes only
140-
pc_overlap = pc_load[pc_load['Gene'].isin(overlap_genes)].copy()
141-
142-
# Sort by PC weights and get top and bottom genes
143-
pc_sorted = pc_overlap.sort_values(by=pc_name, ascending=False)
144-
top_genes = pc_sorted['Gene'].head(geneAmount).tolist()
145-
bottom_genes = pc_sorted['Gene'].tail(geneAmount).tolist()
146-
147-
selected_genes.extend(top_genes)
148-
selected_genes.extend(bottom_genes)
149-
150-
# Remove duplicates while preserving order
151-
selected_genes = list(dict.fromkeys(selected_genes))
152-
153-
print(f"Selected {len(selected_genes)} genes after filtering for top/bottom weighted")
154-
155-
# Get expression data for selected genes
156-
genesV = X[:, selected_genes]
157-
df = genesV.to_df()
51+
# Get expression data for all genes
52+
df = X.to_df()
15853

15954
# Add metadata
16055
df["Status"] = X.obs["SLE_status"].values
16156
df["Condition"] = X.obs["Condition"].values
16257
df["Cell Type"] = X.obs["Cell Type"].values
163-
164-
# Debug: Check what cell types are available
165-
print("Available cell types:")
166-
print(df["Cell Type"].value_counts())
167-
168-
# Filter to only CM cells
169-
df_filtered = df[df["Cell Type"] == "cM"]
170-
print(f"After filtering for CM cells only: {len(df_filtered)} rows")
58+
59+
# Filter to only CM cells (try both "CM" and "cM" in case of different naming)
60+
df_filtered = df[df["Cell Type"].isin(["cM"])]
61+
print(f"After filtering for CM cells: {len(df_filtered)} rows")
17162

17263
if len(df_filtered) == 0:
17364
print("No CM cells found! Available cell types:")
17465
print(df["Cell Type"].unique())
175-
return pd.DataFrame(), selected_genes
66+
return pd.DataFrame()
17667

177-
df = df_filtered
68+
# Get all gene columns (exclude metadata columns)
69+
metadata_cols = ["Status", "Condition", "Cell Type"]
70+
gene_cols = [col for col in df_filtered.columns if col not in metadata_cols]
71+
72+
print(f"Processing {len(gene_cols)} genes")
17873

17974
# Group by sample (Condition) and cell type to get mean expression per sample for each gene
18075
groupby_cols = ["Status", "Cell Type", "Condition"]
181-
gene_cols = selected_genes
182-
df_final = df.groupby(groupby_cols, observed=False)[gene_cols].mean().reset_index()
76+
df_final = df_filtered.groupby(groupby_cols, observed=False)[gene_cols].mean().reset_index()
18377
df_final = df_final.dropna().sort_values(["Cell Type", "Condition"])
18478

18579
# Add cell count information
@@ -190,59 +84,8 @@ def get_mean_expression_overlapping_genes(X, pc1_load, pc2_load, geneAmount=10):
19084
df_final = df_final.merge(df_count, on=["Cell Type", "Condition"], how="left")
19185
df_final['Cell Type'] = df_final['Cell Type'].astype('category').cat.remove_unused_categories()
19286

193-
return df_final, selected_genes
194-
195-
196-
def avegene_lupus(X, genes, overlap_genes):
197-
"""Average gene expression of multiple overlapping genes in lupus samples for nCM and CM populations."""
198-
# Handle both single gene (string) and multiple genes (list)
199-
if isinstance(genes, str):
200-
genes = [genes]
201-
202-
# Filter genes to only include those that overlap and are present in the dataset
203-
available_genes = [gene for gene in genes if gene in overlap_genes and gene in X.var_names]
204-
205-
if not available_genes:
206-
raise ValueError(f"None of the provided genes {genes} are found in the overlap genes or dataset")
207-
208-
# Get expression data for the available overlapping genes
209-
genesV = X[:, available_genes]
210-
df = genesV.to_df()
87+
return df_final
21188

212-
# Add metadata
213-
df["Status"] = X.obs["SLE_status"].values
214-
df["Condition"] = X.obs["Condition"].values
215-
df["Cell Type"] = X.obs["Cell Type"].values
216-
217-
# Filter to only nCM and CM populations
218-
df = df[df["Cell Type"].isin(["nCM", "CM"])]
219-
220-
# If multiple genes, compute their average expression
221-
if len(available_genes) > 1:
222-
# Create a new column with the average expression across overlapping genes
223-
gene_avg_name = f"avg_{len(available_genes)}_overlapping_genes"
224-
df[gene_avg_name] = df[available_genes].mean(axis=1)
225-
# Keep only the average column and metadata
226-
expression_cols = [gene_avg_name]
227-
else:
228-
# Single gene case
229-
gene_avg_name = available_genes[0]
230-
expression_cols = available_genes
231-
232-
# Group by sample (Condition) and cell type to get mean expression per sample
233-
groupby_cols = ["Status", "Cell Type", "Condition"]
234-
df_mean = df.groupby(groupby_cols, observed=False)[expression_cols + available_genes].mean().reset_index()
235-
df_mean = df_mean.dropna().sort_values(["Cell Type", "Condition"])
236-
237-
# Add cell count information
238-
df_count = df.groupby(["Cell Type", "Condition"], observed=False).size().reset_index(
239-
name="Cell Count").sort_values(["Cell Type", "Condition"])
240-
df_count = df_count[df_count["Cell Type"].isin(["nCM", "CM"])]
24189

242-
# Merge cell count with mean expression data
243-
df_final = df_mean.merge(df_count, on=["Cell Type", "Condition"], how="left")
244-
df_final['Cell Type'] = df_final['Cell Type'].astype('category').cat.remove_unused_categories()
245-
246-
return df_final, gene_avg_name, available_genes
24790

24891

0 commit comments

Comments
 (0)