11"""
2- Figure 5a_e Generation Scri # Get the final dataframe with mean gene expression for each sample
3- # for top/bottom weighted overlapping genes in CM population only
4- geneAmount = 500
5- This module generates a comprehensive figure comparing PCA and PF2 component analyses
6- for gene loadings and factors.
2+ Figure Wong Generation Script
3+
4+ This module generates mean expression data for CM cells across all genes.
75"""
86
97import anndata
108import numpy as np
119import pandas as pd
12- import seaborn as sns
13- from scipy import stats
14- import mygene
1510from .common import getSetup , subplotLabel
16- import statsmodels .api as sm
1711from ..imports import import_lupus
1812
1913
@@ -27,159 +21,59 @@ def makeFigure():
2721 print ("Available cell types in dataset:" )
2822 print (X .obs ["Cell Type" ].value_counts ())
2923
30- pc1_load = load_pc_loadings (pc_component = 1 )
31- pc2_load = load_pc_loadings (pc_component = 2 )
32-
3324 # Get the final dataframe with mean gene expression for each sample
34- # for top/bottom weighted overlapping genes in nCM and CM populations only
35- geneAmount = 500
36- df_final , selected_genes = get_mean_expression_overlapping_genes (X , pc1_load , pc2_load , geneAmount )
25+ # for CM population across all genes
26+ df_final = get_mean_expression_cm_cells (X )
27+ print (f"Final dataframe shape: { df_final .shape } " )
28+ print (df_final .head ())
29+
30+ if not df_final .empty :
31+ df_final .to_csv ("lupus_mean_expression.csv" , index = False )
32+ print ("Data saved to lupus_mean_expression.csv" )
33+ else :
34+ print ("No data to save - dataframe is empty" )
35+
3736 print (df_final )
38- print (selected_genes )
39-
40- # df_final.to_csv("lupus_mean_expression.csv", index=False)
41-
42- # print(df_final)
43- # print(f"Final dataframe shape: {df_final.shape}")
44- # print(f"Number of selected genes: {len(selected_genes)}")
45- # print(f"Selected genes: {selected_genes}")
46- # print("\nFinal dataframe with mean gene expression per sample:")
47- # print(df_final.head(10))
48- # print(f"\nColumns in final dataframe: {list(df_final.columns)}")
4937
5038 return f
5139
5240
53-
54- def load_pc_loadings (pc_component : int ) -> pd .DataFrame :
55- """
56- Load and preprocess PC loadings for a specific component.
57-
58- Args:
59- pc_component (int): Principal Component number (1 or 2)
60- geneAmount (int): Number of genes to consider
61-
62- Returns:
63- pd.DataFrame: Processed PC loadings DataFrame
41+ def get_mean_expression_cm_cells (X ):
6442 """
65-
66- df = pd .read_csv (f"loadings_time_series_PC{ pc_component } .csv" , dtype = str )
67- df = df .rename (columns = {"Unnamed: 0" : "Gene" })
68- df ["Gene" ] = convert_gene_symbols (df ["Gene" ])[0 ]
69- df [f"PC{ pc_component } " ] = stats .zscore (df [f"PC{ pc_component } " ].astype (float ))
70-
71- return df
72-
73-
74- def convert_gene_symbols (genes ):
75- """
76- Convert gene symbols using MyGene.
77-
78- Args:
79- genes (list): List of gene symbols
80- species (str): Species for gene conversion
81-
82- Returns:
83- tuple: Converted genes and list of genes without hits
84- """
85- mg = mygene .MyGeneInfo ()
86- results = mg .querymany (
87- genes ,
88- scopes = 'symbol' ,
89- fields = ['symbol' , 'entrezgene' ],
90- species = 'mouse' ,
91- transformed = True
92- )
93-
94- conversion_map = []
95- no_hit_genes = []
96-
97- for gene , result in zip (genes , results ):
98- if result and 'symbol' in result :
99- conversion_map .append (result .get ('symbol' , gene ).upper ())
100- else :
101- conversion_map .append (gene .upper ())
102- no_hit_genes .append (gene )
103-
104- return conversion_map , no_hit_genes
105-
106-
107- def get_mean_expression_overlapping_genes (X , pc1_load , pc2_load , geneAmount = 10 ):
108- """
109- Get final dataframe with mean gene expression for each sample for top/bottom weighted overlapping genes
110- in CM population only.
43+ Get final dataframe with mean gene expression for each sample for CM cells across all genes.
11144
11245 Args:
11346 X: AnnData object with lupus data
114- pc1_load: PC1 loadings DataFrame
115- pc2_load: PC2 loadings DataFrame
116- geneAmount: Number of top and bottom genes to include
11747
11848 Returns:
119- pd.DataFrame: Final dataframe with mean expression per sample for selected genes
120- list: List of selected gene names
49+ pd.DataFrame: Final dataframe with mean expression per sample for all genes
12150 """
122- # Find overlapping genes between PC loadings and dataset
123- pc1_genes = set (pc1_load ['Gene' ])
124- pc2_genes = set (pc2_load ['Gene' ])
125- dataset_genes = set (X .var_names )
126-
127- # Get genes that overlap between PC1, PC2, and the dataset
128- overlap_genes = list (pc1_genes & pc2_genes & dataset_genes )
129-
130- print (f"Found { len (overlap_genes )} overlapping genes" )
131-
132- if not overlap_genes :
133- raise ValueError ("No overlapping genes found between PC loadings and dataset" )
134-
135- # Get top and bottom weighted genes from PC1 and PC2
136- selected_genes = []
137-
138- for pc_load , pc_name in [(pc1_load , "PC1" ), (pc2_load , "PC2" )]:
139- # Filter to overlapping genes only
140- pc_overlap = pc_load [pc_load ['Gene' ].isin (overlap_genes )].copy ()
141-
142- # Sort by PC weights and get top and bottom genes
143- pc_sorted = pc_overlap .sort_values (by = pc_name , ascending = False )
144- top_genes = pc_sorted ['Gene' ].head (geneAmount ).tolist ()
145- bottom_genes = pc_sorted ['Gene' ].tail (geneAmount ).tolist ()
146-
147- selected_genes .extend (top_genes )
148- selected_genes .extend (bottom_genes )
149-
150- # Remove duplicates while preserving order
151- selected_genes = list (dict .fromkeys (selected_genes ))
152-
153- print (f"Selected { len (selected_genes )} genes after filtering for top/bottom weighted" )
154-
155- # Get expression data for selected genes
156- genesV = X [:, selected_genes ]
157- df = genesV .to_df ()
51+ # Get expression data for all genes
52+ df = X .to_df ()
15853
15954 # Add metadata
16055 df ["Status" ] = X .obs ["SLE_status" ].values
16156 df ["Condition" ] = X .obs ["Condition" ].values
16257 df ["Cell Type" ] = X .obs ["Cell Type" ].values
163-
164- # Debug: Check what cell types are available
165- print ("Available cell types:" )
166- print (df ["Cell Type" ].value_counts ())
167-
168- # Filter to only CM cells
169- df_filtered = df [df ["Cell Type" ] == "cM" ]
170- print (f"After filtering for CM cells only: { len (df_filtered )} rows" )
58+
59+ # Filter to only CM cells (try both "CM" and "cM" in case of different naming)
60+ df_filtered = df [df ["Cell Type" ].isin (["cM" ])]
61+ print (f"After filtering for CM cells: { len (df_filtered )} rows" )
17162
17263 if len (df_filtered ) == 0 :
17364 print ("No CM cells found! Available cell types:" )
17465 print (df ["Cell Type" ].unique ())
175- return pd .DataFrame (), selected_genes
66+ return pd .DataFrame ()
17667
177- df = df_filtered
68+ # Get all gene columns (exclude metadata columns)
69+ metadata_cols = ["Status" , "Condition" , "Cell Type" ]
70+ gene_cols = [col for col in df_filtered .columns if col not in metadata_cols ]
71+
72+ print (f"Processing { len (gene_cols )} genes" )
17873
17974 # Group by sample (Condition) and cell type to get mean expression per sample for each gene
18075 groupby_cols = ["Status" , "Cell Type" , "Condition" ]
181- gene_cols = selected_genes
182- df_final = df .groupby (groupby_cols , observed = False )[gene_cols ].mean ().reset_index ()
76+ df_final = df_filtered .groupby (groupby_cols , observed = False )[gene_cols ].mean ().reset_index ()
18377 df_final = df_final .dropna ().sort_values (["Cell Type" , "Condition" ])
18478
18579 # Add cell count information
@@ -190,59 +84,8 @@ def get_mean_expression_overlapping_genes(X, pc1_load, pc2_load, geneAmount=10):
19084 df_final = df_final .merge (df_count , on = ["Cell Type" , "Condition" ], how = "left" )
19185 df_final ['Cell Type' ] = df_final ['Cell Type' ].astype ('category' ).cat .remove_unused_categories ()
19286
193- return df_final , selected_genes
194-
195-
196- def avegene_lupus (X , genes , overlap_genes ):
197- """Average gene expression of multiple overlapping genes in lupus samples for nCM and CM populations."""
198- # Handle both single gene (string) and multiple genes (list)
199- if isinstance (genes , str ):
200- genes = [genes ]
201-
202- # Filter genes to only include those that overlap and are present in the dataset
203- available_genes = [gene for gene in genes if gene in overlap_genes and gene in X .var_names ]
204-
205- if not available_genes :
206- raise ValueError (f"None of the provided genes { genes } are found in the overlap genes or dataset" )
207-
208- # Get expression data for the available overlapping genes
209- genesV = X [:, available_genes ]
210- df = genesV .to_df ()
87+ return df_final
21188
212- # Add metadata
213- df ["Status" ] = X .obs ["SLE_status" ].values
214- df ["Condition" ] = X .obs ["Condition" ].values
215- df ["Cell Type" ] = X .obs ["Cell Type" ].values
216-
217- # Filter to only nCM and CM populations
218- df = df [df ["Cell Type" ].isin (["nCM" , "CM" ])]
219-
220- # If multiple genes, compute their average expression
221- if len (available_genes ) > 1 :
222- # Create a new column with the average expression across overlapping genes
223- gene_avg_name = f"avg_{ len (available_genes )} _overlapping_genes"
224- df [gene_avg_name ] = df [available_genes ].mean (axis = 1 )
225- # Keep only the average column and metadata
226- expression_cols = [gene_avg_name ]
227- else :
228- # Single gene case
229- gene_avg_name = available_genes [0 ]
230- expression_cols = available_genes
231-
232- # Group by sample (Condition) and cell type to get mean expression per sample
233- groupby_cols = ["Status" , "Cell Type" , "Condition" ]
234- df_mean = df .groupby (groupby_cols , observed = False )[expression_cols + available_genes ].mean ().reset_index ()
235- df_mean = df_mean .dropna ().sort_values (["Cell Type" , "Condition" ])
236-
237- # Add cell count information
238- df_count = df .groupby (["Cell Type" , "Condition" ], observed = False ).size ().reset_index (
239- name = "Cell Count" ).sort_values (["Cell Type" , "Condition" ])
240- df_count = df_count [df_count ["Cell Type" ].isin (["nCM" , "CM" ])]
24189
242- # Merge cell count with mean expression data
243- df_final = df_mean .merge (df_count , on = ["Cell Type" , "Condition" ], how = "left" )
244- df_final ['Cell Type' ] = df_final ['Cell Type' ].astype ('category' ).cat .remove_unused_categories ()
245-
246- return df_final , gene_avg_name , available_genes
24790
24891
0 commit comments