首页  

Alphalens FinMind Qlib 实例     所属分类 quant 浏览量 7
Alphalens  FinMind  Qlib  实例
基于A股数据(本地Excel日线数据)

项目架构设计
project/
├── data/
│   ├── raw/              # 原始Excel数据
│   ├── processed/        # 处理后的数据
│   └── features/         # 特征数据
├── notebooks/           # Jupyter Notebook分析
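├── src/
│   ├── data_loader.py          # 数据加载模块
│   ├── factor_builder.py       # 因子构建
│   ├── alphalens_analysis.py   # Alphalens因子分析
│   ├── qlib_integration.py     # Qlib集成
│   ├── backtest.py             # 回测模块
│   └── utils.py                # 工具函数
├── results/                    # 分析结果输出
├── main.py                     # 主程序入口
├── config.py                   # 配置文件
└── requirements.txt            # 依赖包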



# 1. Environment setup
#
# requirements.txt
#     pandas>=1.5.0
#     numpy>=1.23.0
#     alphalens>=0.4.0
#     empyrical>=0.5.0
#     matplotlib>=3.6.0
#     seaborn>=0.12.0
#     pyqlib>=0.9.0        # Microsoft Qlib is published on PyPI as "pyqlib"
#     finmind>=2.0.0
#     openpyxl>=3.0.0
#     jupyter>=1.0.0
#
# NOTE(review): confirm that the pinned alphalens / FinMind versions exist
# and work with pandas>=1.5 before relying on this list.

# 2. Data loading module
# src/data_loader.py
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
import warnings

warnings.filterwarnings('ignore')


class AShareDataLoader:
    """Load A-share daily bars from local Excel files or the FinMind API."""

    def __init__(self, data_path='data/raw/'):
        self.data_path = data_path

    def load_excel_data(self, file_path, sheet_name='日线数据'):
        """Read one Excel sheet of daily bars and normalise it.

        Chinese column headers are renamed to
        ['date', 'code', 'open', 'high', 'low', 'close', 'volume',
        'turnover']; 'date' is parsed to datetime and 'code' is normalised
        by ``_standardize_code``.

        Returns:
            The normalised DataFrame in long format (one row per date/code).
        """
        df = pd.read_excel(file_path, sheet_name=sheet_name)

        # Normalise column names.
        column_mapping = {
            '日期': 'date',
            '股票代码': 'code',
            '开盘价': 'open',
            '最高价': 'high',
            '最低价': 'low',
            '收盘价': 'close',
            '成交量': 'volume',
            '成交额': 'turnover'
        }
        df.rename(columns=column_mapping, inplace=True)

        df['date'] = pd.to_datetime(df['date'])

        # Normalise codes to the "600001.SH" form.
        df['code'] = df['code'].astype(str)
        df['code'] = df['code'].apply(self._standardize_code)

        return df

    def load_multiple_excel(self, folder_path):
        """Load and concatenate every .xlsx/.xls file found in ``folder_path``.

        Raises:
            ValueError: if the folder contains no Excel file.
        """
        all_data = [
            self.load_excel_data(os.path.join(folder_path, file))
            for file in os.listdir(folder_path)
            if file.endswith(('.xlsx', '.xls'))
        ]

        if not all_data:
            raise ValueError(f"No Excel files found in {folder_path}")
        return pd.concat(all_data, ignore_index=True)

    def _standardize_code(self, code):
        """Append the exchange suffix (.SH / .SZ / .BJ) to a 6-digit code.

        Codes that Excel stored as numbers arrive as "1" or "1.0"; they are
        restored to six digits first so that "000001" is not lost. Anything
        that is still not six characters long is returned unchanged.
        """
        code = str(code).strip()

        # Undo numeric storage: "1.0" -> "1" -> "000001".
        if code.endswith('.0') and code[:-2].isdigit():
            code = code[:-2]
        if code.isdigit() and len(code) < 6:
            code = code.zfill(6)

        if len(code) == 6:
            if code.startswith(('6', '9')):
                return f"{code}.SH"
            elif code.startswith(('0', '3')):
                return f"{code}.SZ"
            elif code.startswith(('4', '8')):
                return f"{code}.BJ"  # Beijing Stock Exchange
        return code

    def fetch_finmind_data(self, stock_codes, start_date, end_date):
        """Fetch supplementary daily bars from the FinMind API.

        NOTE(review): this calls ``taiwan_stock_daily``, which by its name
        serves Taiwan-listed stocks — confirm it returns anything useful for
        A-share codes.

        Returns:
            A DataFrame with columns renamed to this project's schema, or
            ``None`` when FinMind is not installed or nothing was fetched.
        """
        try:
            from FinMind.data import DataLoader
        except ImportError:
            try:
                # Spelling used by the original code; kept as a fallback.
                from finmind.data import DataLoader
            except ImportError:
                print("FinMind not installed. Using local data only.")
                return None

        api = DataLoader()
        all_data = []

        for code in stock_codes:
            try:
                # Strip the exchange suffix to get the raw code.
                raw_code = code.split('.')[0]
                df = api.taiwan_stock_daily(
                    stock_id=raw_code,
                    start_date=start_date,
                    end_date=end_date
                )
            except Exception as e:
                print(f"Error fetching data for {code}: {e}")
                continue

            if not df.empty:
                # 'code' replaces 'stock_id'; renaming 'stock_id' to 'code'
                # as well would leave two columns with the same name.
                df = df.drop(columns=['stock_id'], errors='ignore')
                df['code'] = code
                all_data.append(df)

        if not all_data:
            return None

        combined_df = pd.concat(all_data, ignore_index=True)
        column_mapping = {
            'max': 'high',
            'min': 'low',
            'Trading_Volume': 'volume',
            'Trading_money': 'turnover'
        }
        combined_df.rename(columns=column_mapping, inplace=True)
        return combined_df

    def create_panel_data(self, df, price_col='close'):
        """Pivot long-format data to a wide panel.

        The result is indexed by date with two-level columns
        (code, price_col), sorted by column.
        """
        df_panel = df.set_index(['date', 'code'])[[price_col]]
        df_panel = df_panel.unstack().swaplevel(axis=1).sort_index(axis=1)
        return df_panel


# 3. Factor construction module
# src/factor_builder.py
import pandas as pd
import numpy as np
from scipy import stats


class FactorFactory:
    """Collection of static factor constructors."""

    @staticmethod
    def momentum_factor(prices, lookback_period=20):
        """Momentum: return over the past ``lookback_period`` rows."""
        return prices.pct_change(lookback_period)

    @staticmethod
    def mean_reversion_factor(prices, lookback_period=20):
        """Mean reversion: relative deviation of price from its rolling mean."""
        ma = prices.rolling(window=lookback_period).mean()
        return (prices - ma) / ma

    @staticmethod
    def volume_factor(prices, volumes, lookback_period=10):
        """Price/volume: rolling correlation of returns and volume changes."""
        returns = prices.pct_change()
        volume_change = volumes.pct_change()
        return returns.rolling(window=lookback_period).corr(volume_change)

    @staticmethod
    def volatility_factor(prices, lookback_period=20):
        """Volatility: rolling standard deviation of returns."""
        returns = prices.pct_change()
        return returns.rolling(window=lookback_period).std()

    @staticmethod
    def liquidity_factor(volumes, prices, lookback_period=5):
        """Liquidity: rolling mean of volume / price (a simplified turnover)."""
        turnover = volumes / prices
        return turnover.rolling(window=lookback_period).mean()

    @staticmethod
    def _single_asset_technicals(frame):
        """Compute RSI, MACD histogram and Bollinger position for one series."""
        close = frame['close']
        factors = {}

        # RSI (relative strength index), 14-row simple averages.
        delta = close.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
        rs = gain / loss
        factors['RSI'] = 100 - (100 / (1 + rs))

        # MACD histogram: (EMA12 - EMA26) minus its 9-span EMA.
        exp1 = close.ewm(span=12, adjust=False).mean()
        exp2 = close.ewm(span=26, adjust=False).mean()
        macd = exp1 - exp2
        signal = macd.ewm(span=9, adjust=False).mean()
        factors['MACD'] = macd - signal

        # Bollinger position: distance from the 20-row mean in 2-sigma units.
        ma = close.rolling(window=20).mean()
        std = close.rolling(window=20).std()
        factors['BB_position'] = (close - ma) / (2 * std)

        return pd.DataFrame(factors)

    @staticmethod
    def technical_factors(df):
        """Compute technical-indicator factors (RSI, MACD, BB_position).

        When ``df`` has a 'code' column the indicators are computed per
        stock, so rolling windows never span two different stocks, and the
        result is indexed by (date, code) when a 'date' column exists.
        Without a 'code' column ``df`` is treated as a single series and the
        result keeps ``df``'s index.
        """
        if 'code' not in df.columns:
            return FactorFactory._single_asset_technicals(df)

        pieces = []
        for _, group in df.groupby('code', sort=True):
            if 'date' in group.columns:
                group = group.sort_values('date').set_index(['date', 'code'])
            pieces.append(FactorFactory._single_asset_technicals(group))

        if not pieces:
            return pd.DataFrame(columns=['RSI', 'MACD', 'BB_position'])
        return pd.concat(pieces).sort_index()


# 4. Alphalens analysis module
# src/alphalens_analysis.py
import alphalens as al
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime


class AlphaLensAnalyzer:
    """Factor analysis helpers built on Alphalens."""

    def __init__(self):
        plt.style.use('seaborn-v0_8-darkgrid')

    def prepare_factor_data(self, factor_data, prices, quantiles=5, periods=(1, 5, 10, 20)):
        """Build the Alphalens factor/forward-returns frame.

        Args:
            factor_data: factor values indexed by (date, asset).
            prices: price DataFrame indexed by date, one column per asset.
            quantiles: number of quantile buckets.
            periods: holding periods for forward returns.

        Returns:
            The result of ``get_clean_factor_and_forward_returns``.
        """
        # Work on a copy so the caller's index is left untouched.
        factor = factor_data.copy()
        original_index = factor.index
        factor.index = pd.MultiIndex.from_arrays(
            [
                pd.to_datetime(original_index.get_level_values(0)),
                original_index.get_level_values(1),
            ],
            names=['date', 'asset'],
        )

        # Keep only the price rows for dates that carry factor values.
        prices = prices.loc[factor.index.get_level_values(0).unique()]

        return al.utils.get_clean_factor_and_forward_returns(
            factor=factor,
            prices=prices,
            quantiles=quantiles,
            periods=periods,
            max_loss=0.35
        )

    def run_full_analysis(self, factor_data, factor_name="Factor"):
        """Compute quantile returns, IC and turnover, then plot them.

        Returns:
            dict with keys 'mean_return', 'ic' and 'turnover'.
        """
        # 1. Mean return per quantile.
        mean_return_by_q, std_err_by_q = al.performance.mean_return_by_quantile(
            factor_data,
            by_group=False
        )

        # 2. Information coefficient.
        ic = al.performance.factor_information_coefficient(factor_data)

        # 3. Turnover: quantile_turnover works on one quantile at a time, so
        #    collect one column per quantile.
        quantile_factor = factor_data['factor_quantile']
        turnover = pd.concat(
            [
                al.performance.quantile_turnover(quantile_factor, q)
                for q in sorted(quantile_factor.dropna().unique())
            ],
            axis=1,
        )

        self.create_visualizations(
            factor_data, mean_return_by_q, ic, turnover, factor_name
        )

        return {
            'mean_return': mean_return_by_q,
            'ic': ic,
            'turnover': turnover
        }

    def create_visualizations(self, factor_data, mean_returns, ic, turnover, factor_name):
        """Draw a 3x2 summary figure and save it under results/.

        NOTE(review): the original called ``al.plotting.plot_turnover_hist``
        and ``al.plotting.plot_mean_quantile_returns``, which do not appear
        in the Alphalens plotting API; they are replaced below by the
        closest documented plots. Confirm against the installed version.
        """
        import os

        fig, axes = plt.subplots(3, 2, figsize=(16, 18))

        # 1. Mean return per quantile (bar chart).
        al.plotting.plot_quantile_returns_bar(mean_returns, ax=axes[0, 0])
        axes[0, 0].set_title(f'{factor_name} - Quantile Returns')

        # 2. Cumulative return per quantile needs by-date returns for a
        #    single forward-return column.
        returns_by_date, _ = al.performance.mean_return_by_quantile(
            factor_data, by_date=True, by_group=False
        )
        period = '5D' if '5D' in returns_by_date.columns else returns_by_date.columns[0]
        freq = getattr(factor_data.index.levels[0], 'freq', None)
        if freq is None:
            freq = pd.tseries.offsets.BDay()
        al.plotting.plot_cumulative_returns_by_quantile(
            returns_by_date[period], period=period, freq=freq, ax=axes[0, 1]
        )
        axes[0, 1].set_title(f'{factor_name} - Cumulative Returns by Quantile')

        # 3. IC time series and distribution.
        al.plotting.plot_ic_ts(ic, ax=axes[1, 0])
        axes[1, 0].set_title(f'{factor_name} - Information Coefficient')

        al.plotting.plot_ic_hist(ic, ax=axes[1, 1])
        axes[1, 1].set_title(f'{factor_name} - IC Distribution')

        # 4. Turnover of the top and bottom quantiles.
        al.plotting.plot_top_bottom_quantile_turnover(turnover, period=1, ax=axes[2, 0])
        axes[2, 0].set_title(f'{factor_name} - Top/Bottom Quantile Turnover')

        # 5. Distribution of by-date quantile returns.
        al.plotting.plot_quantile_returns_violin(returns_by_date, ax=axes[2, 1])
        axes[2, 1].set_title(f'{factor_name} - Quantile Returns Distribution')

        plt.tight_layout()
        # savefig does not create missing directories.
        os.makedirs('results', exist_ok=True)
        plt.savefig(f'results/{factor_name}_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()

    def create_factor_tear_sheet(self, factor_data, prices, factor_name="Custom_Factor"):
        """Run the full Alphalens tear sheet and save the current figure.

        Any exception is reported with ``print`` and swallowed (best effort).
        """
        import os

        try:
            factor_data_prepared = self.prepare_factor_data(factor_data, prices)

            al.tears.create_full_tear_sheet(
                factor_data_prepared,
                long_short=True,
                group_neutral=False,
                by_group=False
            )

            os.makedirs('results', exist_ok=True)
            plt.savefig(f'results/{factor_name}_tear_sheet.png', dpi=300, bbox_inches='tight')

        except Exception as e:
            print(f"Error creating tear sheet: {e}")


# 5.
# Qlib integration module
# src/qlib_integration.py
import pandas as pd
import numpy as np
from datetime import datetime

import qlib
from qlib.data import D
from qlib.data.dataset import DatasetH
from qlib.data.dataset.handler import DataHandlerLP
from qlib.contrib.data.handler import Alpha158


class QlibAdapter:
    """Thin adapter around Qlib for factor research and backtesting."""

    def __init__(self, provider_uri='~/.qlib/qlib_data/cn_data'):
        # Initialise Qlib against the given data directory.
        qlib.init(provider_uri=provider_uri)

    def create_dataset(self, instruments, start_time, end_time,
                       train_end="2020-12-31", valid_start="2021-01-01",
                       valid_end="2021-06-30", test_start="2021-07-01"):
        """Create an Alpha158-based ``DatasetH`` with train/valid/test segments.

        Args:
            instruments: instrument universe passed to ``Alpha158``.
            start_time: start of the handler range and of the train segment.
            end_time: end of the handler range and of the test segment.
            train_end, valid_start, valid_end, test_start: segment
                boundaries; the defaults are the dates the original code
                hard-coded.

        Returns:
            The constructed ``DatasetH``.
        """
        # NOTE(review): the original also passed ``fit_processors=[]``; it
        # does not look like an Alpha158 parameter and was an empty list, so
        # it is dropped here — confirm against the installed qlib.
        handler = Alpha158(
            instruments=instruments,
            start_time=start_time,
            end_time=end_time,
            infer_processors=[],
            learn_processors=["DropnaLabel", "CSZScoreNorm"],
        )

        dataset = DatasetH(
            handler=handler,
            segments={
                "train": (start_time, train_end),
                "valid": (valid_start, valid_end),
                "test": (test_start, end_time),
            }
        )

        return dataset

    def custom_factor_research(self, factor_data, labels):
        """Compute IC statistics for a factor indexed by (datetime, instrument).

        ``factor_data`` is copied, its two index levels are exposed as
        'datetime' and 'instrument' columns, and the result is handed to
        ``calculate_ic`` together with ``labels``.
        """
        factor_df = factor_data.copy()
        factor_df['datetime'] = pd.to_datetime(factor_df.index.get_level_values(0))
        factor_df['instrument'] = factor_df.index.get_level_values(1)

        return self.calculate_ic(factor_df, labels)

    @staticmethod
    def _with_key_columns(frame, keys):
        """Return ``frame`` with ``keys`` available as ordinary columns."""
        if all(key in frame.columns for key in keys):
            # The keys are already columns; resetting the index with
            # drop=False could fail when index levels share those names.
            return frame.reset_index(drop=True)
        return frame.reset_index()

    def calculate_ic(self, factor_df, returns_df):
        """Compute the cross-sectional information coefficient per date.

        Both inputs must provide 'datetime' and 'instrument' (as columns or
        as index level names); ``factor_df`` needs a 'factor' column and
        ``returns_df`` a 'return' column.

        Returns:
            dict with 'IC_mean', 'IC_std', 'IC_IR' (0 when the standard
            deviation is not positive) and the per-date 'IC_series'.
        """
        keys = ['datetime', 'instrument']
        merged_data = pd.merge(
            self._with_key_columns(factor_df, keys),
            self._with_key_columns(returns_df, keys),
            on=keys
        )

        # Correlation of factor and return within each date.
        ic_series = merged_data.groupby('datetime').apply(
            lambda x: x['factor'].corr(x['return'])
        )

        ic_mean = ic_series.mean()
        ic_std = ic_series.std()
        return {
            'IC_mean': ic_mean,
            'IC_std': ic_std,
            'IC_IR': ic_mean / ic_std if ic_std > 0 else 0,
            'IC_series': ic_series
        }


# 6.
# Main program example
# main.py
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

from src.data_loader import AShareDataLoader
from src.factor_builder import FactorFactory
from src.alphalens_analysis import AlphaLensAnalyzer
from src.qlib_integration import QlibAdapter


def main():
    """Load local Excel data, build factors and run the Alphalens analysis."""
    data_loader = AShareDataLoader('data/raw/')
    factor_builder = FactorFactory()
    analyzer = AlphaLensAnalyzer()

    # 1. Load the local Excel data.
    print("加载本地Excel数据...")
    df = data_loader.load_excel_data('data/raw/A股日线数据.xlsx')

    print(f"数据形状: {df.shape}")
    print(f"时间范围: {df['date'].min()} 到 {df['date'].max()}")
    print(f"股票数量: {df['code'].nunique()}")
    print("\n数据样例:")
    print(df.head())

    # 2. Build date x code panels.
    print("\n创建面板数据...")
    prices = df.pivot_table(
        index='date',
        columns='code',
        values='close'
    )

    volumes = df.pivot_table(
        index='date',
        columns='code',
        values='volume'
    )

    # 3. Build factors.
    print("\n构建量化因子...")
    momentum_factor = factor_builder.momentum_factor(prices, lookback_period=20)
    mean_rev_factor = factor_builder.mean_reversion_factor(prices, lookback_period=20)
    tech_factors = factor_builder.technical_factors(df)

    # 4. Reshape the factor to the (date, asset) Series Alphalens expects.
    print("\n准备Alphalens分析数据...")
    momentum_factor_flat = momentum_factor.stack()
    momentum_factor_flat.name = 'momentum'
    momentum_factor_flat.index.names = ['date', 'asset']

    # 5. Run the Alphalens preparation step.
    print("运行Alphalens分析...")
    factor_data = analyzer.prepare_factor_data(
        momentum_factor_flat,
        prices,
        quantiles=5,
        periods=(1, 5, 10, 20)
    )

    # 6. Produce the analysis report.
    print("生成因子分析报告...")
    results = analyzer.run_full_analysis(factor_data, "Momentum_Factor")

    # 7. Save one summary row per forward-return period. The IC mean/std are
    #    Series (one value per period), so they cannot be used in a plain
    #    ``if``; ``where`` applies the zero-std guard element-wise.
    print("保存分析结果...")
    ic_mean = results['ic'].mean()
    ic_std = results['ic'].std()
    results_df = pd.DataFrame({
        'IC_mean': ic_mean,
        'IC_std': ic_std,
        'IC_IR': (ic_mean / ic_std).where(ic_std > 0, 0),
        'Top_quantile_return': results['mean_return'].iloc[-1],
        'Bottom_quantile_return': results['mean_return'].iloc[0]
    })

    os.makedirs('results', exist_ok=True)
    results_df.to_csv('results/factor_performance.csv', index_label='period')

    # 8. Multi-factor example.
    print("\n进行多因子分析...")
    # RSI can only be pivoted to date x code when it carries a two-level index.
    rsi_available = (
        'RSI' in tech_factors.columns
        and isinstance(tech_factors.index, pd.MultiIndex)
    )
    factors_dict = {
        'Momentum': momentum_factor,
        'Mean_Reversion': mean_rev_factor,
        'RSI': tech_factors['RSI'].unstack() if rsi_available else None
    }

    for factor_name, factor_values in factors_dict.items():
        if factor_values is None:
            continue
        try:
            factor_flat = factor_values.stack()
            factor_flat.name = factor_name.lower()
            factor_flat.index.names = ['date', 'asset']

            factor_data = analyzer.prepare_factor_data(factor_flat, prices)
            analyzer.run_full_analysis(factor_data, factor_name)

        except Exception as e:
            print(f"分析因子 {factor_name} 时出错: {e}")

    print("\n分析完成!")


if __name__ == "__main__":
    main()


# 7. Jupyter Notebook example
# notebooks/factor_analysis.ipynb
"""
# A股量化因子分析实例
本笔记本演示如何使用Alphalens、FinMind和Qlib分析A股因子
"""

# Import the required libraries.
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Enable Chinese glyphs in matplotlib.
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Import the project modules.
from src.data_loader import AShareDataLoader
from src.factor_builder import FactorFactory
from src.alphalens_analysis import AlphaLensAnalyzer

# ## 1. Data loading
data_loader = AShareDataLoader()

# Load the sample data.
df = data_loader.load_multiple_excel('../data/raw/')

# Basic information about the data.
print("数据基本信息:")
print(f"时间范围: {df['date'].min()} 到 {df['date'].max()}")
print(f"股票数量: {df['code'].nunique()}")
print(f"总记录数: {len(df)}")

# ## 2.
# Data preprocessing
# Build the price and volume panels (date x code).
prices = df.pivot_table(index='date', columns='code', values='close')
volumes = df.pivot_table(index='date', columns='code', values='volume')

# Fill gaps. NOTE(review): bfill copies later values backwards, which leaks
# future prices into earlier rows — confirm this is acceptable for the study.
prices = prices.ffill().bfill()
volumes = volumes.ffill().bfill()

# ## 3. Factor computation
factor_builder = FactorFactory()

# Momentum factor.
momentum_20d = factor_builder.momentum_factor(prices, 20)

# Volatility factor.
volatility_20d = factor_builder.volatility_factor(prices, 20)

# Technical indicators.
tech_factors = factor_builder.technical_factors(df)

# ## 4. Alphalens analysis
analyzer = AlphaLensAnalyzer()

# Reshape the momentum factor to a (date, asset) Series.
momentum_flat = momentum_20d.stack()
momentum_flat.name = 'momentum'
momentum_flat.index.names = ['date', 'asset']

# Run the analysis.
factor_data = analyzer.prepare_factor_data(momentum_flat, prices)
results = analyzer.run_full_analysis(factor_data, "20日动量因子")

# ## 5. Combined factor analysis
# RSI is only usable per stock when tech_factors has a (date, code) index.
rsi_available = (
    'RSI' in tech_factors.columns
    and isinstance(tech_factors.index, pd.MultiIndex)
)

score_columns = {}
for code in prices.columns:
    # Standardise each factor over time for this stock.
    momentum_z = (momentum_20d[code] - momentum_20d[code].mean()) / momentum_20d[code].std()

    rsi = None
    if rsi_available:
        rsi = tech_factors.loc[tech_factors.index.get_level_values(1) == code, 'RSI']

    if rsi is None or rsi.empty:
        # No RSI for this stock: fall back to momentum alone rather than
        # reusing a value left over from a previous iteration.
        score_columns[code] = momentum_z
        continue

    # Drop the code level so RSI lines up with the date-indexed momentum.
    rsi = rsi.droplevel(1).reindex(prices.index)
    rsi_z = (rsi - rsi.mean()) / rsi.std()

    # Composite score (example: momentum plus RSI reversal).
    score_columns[code] = 0.6 * momentum_z + 0.4 * (-rsi_z)

# Build the frame in one step instead of inserting columns one by one.
factor_scores = pd.DataFrame(score_columns, index=prices.index)

# ## 6.
# Backtest analysis
def backtest_factor(factor_scores, prices, top_n=10, hold_period=5):
    """Run a simple equal-weight top-N backtest.

    Every ``hold_period`` rows (starting at row ``hold_period``) the
    ``top_n`` highest-scoring stocks are selected and held for the next
    ``hold_period`` rows. Positions are shifted by one row before being
    multiplied with returns, and the row sum is divided by ``top_n``.

    Args:
        factor_scores: DataFrame of scores, rows are dates, columns stocks.
        prices: DataFrame of prices used to compute row-to-row returns.
        top_n: number of stocks selected at each rebalance.
        hold_period: number of rows each selection is held.

    Returns:
        Series of portfolio returns indexed like the position/return rows.
    """
    returns = prices.pct_change()
    positions = pd.DataFrame(0, index=factor_scores.index, columns=factor_scores.columns)

    for start in range(hold_period, len(factor_scores), hold_period):
        rebalance_date = factor_scores.index[start]

        # Pick the top-N stocks by score on the rebalance date.
        top_stocks = factor_scores.loc[rebalance_date].nlargest(top_n).index

        # Write through a single .iloc call: the original
        # ``positions.iloc[i + j][top_stocks] = 1`` assigned into a temporary
        # row object, so ``positions`` itself was not reliably updated.
        # The slice is clipped at the end of the frame automatically.
        column_positions = positions.columns.get_indexer(top_stocks)
        positions.iloc[start:start + hold_period, column_positions] = 1

    # Portfolio return: yesterday's holdings times today's returns.
    portfolio_returns = (positions.shift(1) * returns).sum(axis=1) / top_n

    return portfolio_returns


# Run the backtest.
portfolio_returns = backtest_factor(factor_scores, prices)
cumulative_returns = (1 + portfolio_returns).cumprod()

# Plot the backtest results.
fig, axes = plt.subplots(2, 1, figsize=(12, 10))

# Cumulative return.
axes[0].plot(cumulative_returns.index, cumulative_returns.values)
axes[0].set_title('组合累计收益')
axes[0].set_ylabel('累计收益')
axes[0].grid(True)

# Monthly returns (compounded within each month).
monthly_returns = portfolio_returns.resample('M').apply(lambda x: (1 + x).prod() - 1)
axes[1].bar(monthly_returns.index.strftime('%Y-%m'), monthly_returns.values)
axes[1].set_title('月度收益')
axes[1].set_ylabel('月度收益')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# 8.
# Configuration file
# config.py
import os
from datetime import datetime

# Data path configuration.
DATA_CONFIG = {
    'raw_data_path': 'data/raw/',
    'processed_data_path': 'data/processed/',
    'features_path': 'data/features/',
    'results_path': 'results/'
}

# Stock universe configuration.
STOCK_UNIVERSE = {
    '沪深300': ['000300.SH'],  # index code; constituents can be used instead
    '中证500': ['000905.SH'],
    '全A股': 'all'  # use every available stock
}

# Factor parameters.
FACTOR_PARAMS = {
    'momentum_periods': [5, 10, 20, 60],
    'mean_reversion_periods': [5, 10, 20],
    'volatility_window': 20,
    'liquidity_window': 5
}

# Backtest parameters.
BACKTEST_PARAMS = {
    'start_date': '2020-01-01',
    'end_date': '2023-12-31',
    'initial_capital': 1000000,
    'transaction_cost': 0.001,  # 0.1% per trade
    'top_n': 10,  # number of stocks picked at each rebalance
    'hold_period': 20  # holding period in trading days
}

# Create the directories listed in DATA_CONFIG (runs at import time).
for path in DATA_CONFIG.values():
    os.makedirs(path, exist_ok=True)

# Usage
#
# Data preparation:
#   - Save the A-share daily data in Excel format with the required fields.
#   - Put the files under data/raw/.
#
# Install dependencies:
#   pip install -r requirements.txt
#
# Run the analysis:
#   # run the main program
#   python main.py
#   # or use Jupyter Notebook
#   jupyter notebook notebooks/factor_analysis.ipynb
#
# Custom factors:
#   - Add new factor functions to factor_builder.py.
#   - Call the new factor from main.py to analyse it.
这个实例提供了完整的框架,可以根据需要:添加更多因子类型、集成更多数据源、优化回测逻辑、添加风险管理模块、实现自动化交易信号生成。

上一篇    
《趋势跟踪》 笔记

因子量化投资开源库

因子与量化投资联系和区别