Alphalens FinMind Qlib 实例
所属分类 quant
浏览量 7
Alphalens FinMind Qlib 实例
基于A股数据 本地excel日线数据
项目架构设计
project/
├── data/
│ ├── raw/ # 原始Excel数据
│ ├── processed/ # 处理后的数据
│ └── features/ # 特征数据
├── notebooks/ # Jupyter Notebook分析
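├── src/
│ ├── data_loader.py # 数据加载模块
│ ├── factor_builder.py # 因子构建
│ ├── alphalens_analysis.py # Alphalens因子分析模块
│ ├── qlib_integration.py # Qlib集成模块
│ ├── backtest.py # 回测模块
│ └── utils.py # 工具函数
├── results/ # 分析结果输出(图表、CSV)
├── main.py # 主程序
├── config.py # 配置文件
└── requirements.txt # 依赖包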
1. 环境准备
python
# requirements.txt
pandas>=1.5.0
numpy>=1.23.0
alphalens>=0.4.0
empyrical>=0.5.0
matplotlib>=3.6.0
seaborn>=0.12.0
pyqlib>=0.9.0 # Microsoft Qlib 在 PyPI 上的包名是 pyqlib(导入时仍为 import qlib)
finmind>=2.0.0
openpyxl>=3.0.0
jupyter>=1.0.0
2. 数据加载模块
# src/data_loader.py
import pandas as pd
import numpy as np
import os
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
class AShareDataLoader:
    """Load A-share daily bars from local Excel files or the FinMind API."""

    def __init__(self, data_path='data/raw/'):
        # Folder that holds the raw Excel files.
        self.data_path = data_path

    def load_excel_data(self, file_path, sheet_name='日线数据'):
        """Read one Excel sheet of daily bars into a normalised DataFrame.

        Chinese column headers are renamed to
        ``date, code, open, high, low, close, volume, turnover``; columns
        that already use those names are left untouched.

        Args:
            file_path: Path of the workbook to read.
            sheet_name: Sheet that holds the daily bars.

        Returns:
            DataFrame with ``date`` parsed as datetime and ``code`` converted
            by ``_standardize_code``.
        """
        df = pd.read_excel(file_path, sheet_name=sheet_name)
        column_mapping = {
            '日期': 'date',
            '股票代码': 'code',
            '开盘价': 'open',
            '最高价': 'high',
            '最低价': 'low',
            '收盘价': 'close',
            '成交量': 'volume',
            '成交额': 'turnover'
        }
        df.rename(columns=column_mapping, inplace=True)
        df['date'] = pd.to_datetime(df['date'])
        df['code'] = df['code'].astype(str).apply(self._standardize_code)
        return df

    def load_multiple_excel(self, folder_path):
        """Load and concatenate every Excel workbook found in ``folder_path``.

        Raises:
            ValueError: If the folder contains no ``.xlsx`` / ``.xls`` file.
        """
        all_data = []
        # Sorted so the row order of the result does not depend on the OS.
        for file in sorted(os.listdir(folder_path)):
            # "~$..." files are lock files Excel leaves beside open workbooks;
            # they carry an Excel suffix but are not readable workbooks.
            if file.startswith('~$'):
                continue
            if file.lower().endswith(('.xlsx', '.xls')):
                file_path = os.path.join(folder_path, file)
                all_data.append(self.load_excel_data(file_path))
        if not all_data:
            raise ValueError(f"No Excel files found in {folder_path}")
        return pd.concat(all_data, ignore_index=True)

    def _standardize_code(self, code):
        """Return ``code`` with an exchange suffix, e.g. ``600001.SH``.

        Codes that Excel parsed as numbers are repaired first: a trailing
        ``.0`` is removed and missing leading zeros are restored, so ``1``
        becomes ``000001.SZ``. Anything that is not a recognised six-character
        code (including codes that already carry a suffix) is returned
        unchanged.
        """
        code = str(code).strip()
        # A numeric column containing NaN is read as float: "300750.0".
        if code.endswith('.0') and code[:-2].isdigit():
            code = code[:-2]
        # Integer cells drop leading zeros: 1 -> "1" instead of "000001".
        if code.isdigit() and len(code) < 6:
            code = code.zfill(6)
        if len(code) == 6:
            # NOTE(review): prefix table is a heuristic; confirm it against
            # the data vendor. "92" must be tested before the generic "9".
            if code.startswith(('4', '8', '92')):
                return f"{code}.BJ"  # Beijing Stock Exchange
            if code.startswith(('6', '9')):
                return f"{code}.SH"
            if code.startswith(('0', '3')):
                return f"{code}.SZ"
        return code

    def fetch_finmind_data(self, stock_codes, start_date, end_date):
        """Fetch supplementary daily bars from the FinMind API.

        NOTE(review): the request goes through ``taiwan_stock_daily``;
        confirm that FinMind actually serves the A-share codes passed in.

        Args:
            stock_codes: Iterable of codes, with or without exchange suffix.
            start_date: Start date passed through to FinMind.
            end_date: End date passed through to FinMind.

        Returns:
            DataFrame using the same column names as ``load_excel_data``, or
            ``None`` when FinMind is not installed or nothing was fetched.
        """
        # The distribution installs the module as ``FinMind``; the lowercase
        # spelling is kept as a fallback for environments that expose it.
        try:
            from FinMind.data import DataLoader
        except ImportError:
            try:
                from finmind.data import DataLoader
            except ImportError:
                print("FinMind not installed. Using local data only.")
                return None

        api = DataLoader()
        all_data = []
        for code in stock_codes:
            try:
                # Strip the exchange suffix to obtain the raw code.
                raw_code = code.split('.')[0]
                df = api.taiwan_stock_daily(
                    stock_id=raw_code,
                    start_date=start_date,
                    end_date=end_date
                )
                if not df.empty:
                    df['code'] = code
                    all_data.append(df)
            except Exception as e:
                # Best effort: one failing symbol must not abort the batch.
                print(f"Error fetching data for {code}: {e}")
                continue

        if not all_data:
            return None

        combined_df = pd.concat(all_data, ignore_index=True)
        # ``code`` already holds the suffixed symbol; renaming ``stock_id`` to
        # ``code`` as well would create two columns with the same name.
        combined_df = combined_df.drop(columns=['stock_id'], errors='ignore')
        column_mapping = {
            'max': 'high',
            'min': 'low',
            'Trading_Volume': 'volume',
            'Trading_money': 'turnover'
        }
        combined_df.rename(columns=column_mapping, inplace=True)
        return combined_df

    def create_panel_data(self, df, price_col='close'):
        """Pivot long-format bars into a wide panel indexed by date.

        The resulting columns are a two-level index ``(code, price_col)``.
        """
        df_panel = df.set_index(['date', 'code'])[[price_col]]
        df_panel = df_panel.unstack().swaplevel(axis=1).sort_index(axis=1)
        return df_panel
3. 因子构建模块
# src/factor_builder.py
import pandas as pd
import numpy as np
from scipy import stats
class FactorFactory:
    """Collection of static helpers that build quantitative factors.

    Unless stated otherwise the inputs are wide panels (rows are dates,
    columns are assets) and the result has the same shape.
    """

    @staticmethod
    def momentum_factor(prices, lookback_period=20):
        """Momentum: percentage change over ``lookback_period`` rows."""
        return prices.pct_change(lookback_period)

    @staticmethod
    def mean_reversion_factor(prices, lookback_period=20):
        """Mean reversion: relative deviation of price from its rolling mean."""
        ma = prices.rolling(window=lookback_period).mean()
        return (prices - ma) / ma

    @staticmethod
    def volume_factor(prices, volumes, lookback_period=10):
        """Rolling correlation between price returns and volume changes."""
        returns = prices.pct_change()
        volume_change = volumes.pct_change()
        return returns.rolling(window=lookback_period).corr(volume_change)

    @staticmethod
    def volatility_factor(prices, lookback_period=20):
        """Rolling standard deviation of one-row returns."""
        returns = prices.pct_change()
        return returns.rolling(window=lookback_period).std()

    @staticmethod
    def liquidity_factor(volumes, prices, lookback_period=5):
        """Rolling mean of ``volumes / prices``.

        This ratio is only a simplified stand-in for a turnover rate.
        """
        turnover = volumes / prices
        return turnover.rolling(window=lookback_period).mean()

    @staticmethod
    def _single_asset_indicators(bars):
        """Compute RSI, MACD histogram and Bollinger position for one asset.

        ``bars`` must be in chronological order and contain a ``close``
        column; the result shares its index.
        """
        close = bars['close']
        factors = {}

        # RSI over 14 rows, built from simple rolling means of gains/losses.
        delta = close.diff()
        gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
        loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
        rs = gain / loss
        factors['RSI'] = 100 - (100 / (1 + rs))

        # MACD histogram: (EMA12 - EMA26) minus its 9-row EMA signal line.
        exp1 = close.ewm(span=12, adjust=False).mean()
        exp2 = close.ewm(span=26, adjust=False).mean()
        macd = exp1 - exp2
        signal = macd.ewm(span=9, adjust=False).mean()
        factors['MACD'] = macd - signal

        # Position inside the 20-row Bollinger band, in units of 2 std.
        ma = close.rolling(window=20).mean()
        std = close.rolling(window=20).std()
        factors['BB_position'] = (close - ma) / (2 * std)

        return pd.DataFrame(factors)

    @staticmethod
    def technical_factors(df):
        """Compute the technical indicator set (RSI, MACD, BB_position).

        Args:
            df: Either a long table with ``date``, ``code`` and ``close``
                columns covering several stocks, or the bars of a single
                asset in chronological order with a ``close`` column.

        Returns:
            DataFrame with columns ``RSI``, ``MACD`` and ``BB_position``.
            For a long table the index is a ``(date, code)`` MultiIndex and
            every stock is processed separately, so rolling windows never
            mix rows of different stocks. For a single asset the index of
            ``df`` is kept.
        """
        if not {'date', 'code'}.issubset(df.columns):
            return FactorFactory._single_asset_indicators(df)

        per_code = {
            code: FactorFactory._single_asset_indicators(bars.set_index('date'))
            for code, bars in df.sort_values('date').groupby('code')
        }
        if not per_code:
            # Empty input: keep the column contract, return an empty frame.
            empty_index = pd.MultiIndex.from_arrays([[], []], names=['date', 'code'])
            return pd.DataFrame(columns=['RSI', 'MACD', 'BB_position'], index=empty_index)

        stacked = pd.concat(per_code, names=['code', 'date'])
        return stacked.swaplevel('code', 'date').sort_index()
4. Alphalens分析模块
# src/alphalens_analysis.py
import alphalens as al
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
class AlphaLensAnalyzer:
    """Run Alphalens factor analysis and save the resulting charts."""

    def __init__(self):
        plt.style.use('seaborn-v0_8-darkgrid')

    def prepare_factor_data(self, factor_data, prices,
                            quantiles=5, periods=(1, 5, 10, 20)):
        """Build the Alphalens "clean factor" frame.

        Args:
            factor_data: Series/DataFrame of factor values whose index has
                two levels, date first and asset second.
            prices: DataFrame of prices, indexed by date, one column per asset.
            quantiles: Number of factor quantiles.
            periods: Holding periods for the forward returns.

        Returns:
            The result of ``alphalens.utils.get_clean_factor_and_forward_returns``.
        """
        # Work on copies so the caller's objects keep their original index.
        factor = factor_data.copy()
        factor_dates = pd.to_datetime(factor.index.get_level_values(0))
        factor.index = pd.MultiIndex.from_arrays(
            [factor_dates, factor.index.get_level_values(1)],
            names=['date', 'asset']
        )

        prices = prices.copy()
        prices.index = pd.to_datetime(prices.index)
        prices = prices.sort_index()
        # Drop only the history before the first factor date. Later rows are
        # kept because forward returns need prices beyond the factor date, and
        # a boolean mask cannot raise KeyError for dates missing from prices.
        if len(factor_dates):
            prices = prices.loc[prices.index >= factor_dates.min()]

        return al.utils.get_clean_factor_and_forward_returns(
            factor=factor,
            prices=prices,
            quantiles=quantiles,
            periods=periods,
            max_loss=0.35
        )

    def run_full_analysis(self, factor_data, factor_name="Factor"):
        """Compute quantile returns, IC and turnover, then plot them.

        Args:
            factor_data: Frame returned by ``prepare_factor_data``.
            factor_name: Label used in chart titles and the output file name.

        Returns:
            dict with keys ``mean_return`` (mean return per quantile and
            period), ``ic`` (information coefficient per date and period) and
            ``turnover`` (one-period turnover, one column per quantile).
        """
        # 1. Mean return per factor quantile.
        mean_return_by_q, std_err_by_q = al.performance.mean_return_by_quantile(
            factor_data, by_group=False
        )
        # 2. Information coefficient.
        ic = al.performance.factor_information_coefficient(factor_data)
        # 3. Turnover. ``quantile_turnover`` takes the quantile labels and ONE
        #    quantile per call, so the per-quantile series are assembled here.
        quantile_factor = factor_data['factor_quantile']
        quantile_labels = sorted(quantile_factor.dropna().unique())
        turnover = pd.concat(
            {
                q: al.performance.quantile_turnover(quantile_factor, q, period=1)
                for q in quantile_labels
            },
            axis=1
        )

        self.create_visualizations(
            factor_data, mean_return_by_q, ic, turnover, factor_name
        )
        return {
            'mean_return': mean_return_by_q,
            'ic': ic,
            'turnover': turnover
        }

    def create_visualizations(self, factor_data, mean_returns, ic, turnover, factor_name):
        """Draw a 3x2 overview figure and save it under ``results/``.

        Args:
            factor_data: Frame returned by ``prepare_factor_data``.
            mean_returns: Mean return per quantile (rows) and period (columns).
            ic: Information coefficient per date (rows) and period (columns).
            turnover: Turnover per date (rows) and quantile (columns).
            factor_name: Label used in titles and in the output file name.
        """
        import os

        fig, axes = plt.subplots(3, 2, figsize=(16, 18))

        # 1. Mean return per quantile as a bar chart.
        al.plotting.plot_quantile_returns_bar(mean_returns, ax=axes[0, 0])
        axes[0, 0].set_title(f'{factor_name} - Quantile Returns')

        # 2. Cumulative return per quantile. This needs per-date returns, so
        #    they are computed here; the first period column is compounded.
        #    NOTE(review): compounding is only exact for a one-row period.
        daily_by_q, _ = al.performance.mean_return_by_quantile(
            factor_data, by_date=True, by_group=False
        )
        cumulative = daily_by_q.iloc[:, 0].unstack(level=0).add(1).cumprod()
        cumulative.plot(ax=axes[0, 1])
        axes[0, 1].set_title(f'{factor_name} - Cumulative Returns by Quantile')

        # 3. IC charts. The Alphalens IC plots iterate over one axis per
        #    period column, so a single column and a one-element list are
        #    passed to fit one subplot each.
        first_period_ic = ic.iloc[:, [0]]
        al.plotting.plot_ic_ts(first_period_ic, ax=[axes[1, 0]])
        axes[1, 0].set_title(f'{factor_name} - Information Coefficient')
        al.plotting.plot_ic_hist(first_period_ic, ax=[axes[1, 1]])
        axes[1, 1].set_title(f'{factor_name} - IC Distribution')

        # 4. Distribution of turnover per quantile.
        turnover.plot(kind='hist', bins=30, alpha=0.5, ax=axes[2, 0])
        axes[2, 0].set_title(f'{factor_name} - Turnover Distribution')

        # 5. Mean quantile return across holding periods.
        mean_returns.T.plot(marker='o', ax=axes[2, 1])
        axes[2, 1].set_title(f'{factor_name} - Mean Quantile Returns')

        plt.tight_layout()
        # savefig does not create missing folders.
        os.makedirs('results', exist_ok=True)
        plt.savefig(f'results/{factor_name}_analysis.png', dpi=300, bbox_inches='tight')
        plt.show()

    def create_factor_tear_sheet(self, factor_data, prices, factor_name="Custom_Factor"):
        """Create the full Alphalens tear sheet and save the current figure.

        Errors are reported on stdout instead of being raised, so a failing
        report does not stop the calling script.
        """
        import os

        try:
            factor_data_prepared = self.prepare_factor_data(factor_data, prices)
            al.tears.create_full_tear_sheet(
                factor_data_prepared,
                long_short=True,
                group_neutral=False,
                by_group=False
            )
            # NOTE(review): savefig stores only the current figure; the tear
            # sheet may have produced several.
            os.makedirs('results', exist_ok=True)
            plt.savefig(f'results/{factor_name}_tear_sheet.png',
                        dpi=300, bbox_inches='tight')
        except Exception as e:
            print(f"Error creating tear sheet: {e}")
5. Qlib集成模块
# src/qlib_integration.py
import pandas as pd
import numpy as np
from datetime import datetime
import qlib
from qlib.data import D
from qlib.data.dataset import DatasetH
from qlib.data.dataset.handler import DataHandlerLP
from qlib.contrib.data.handler import Alpha158
class QlibAdapter:
    """Thin adapter around Qlib for factor research and dataset creation."""

    def __init__(self, provider_uri='~/.qlib/qlib_data/cn_data'):
        # Initialise Qlib with the given data provider location.
        qlib.init(provider_uri=provider_uri)

    def create_dataset(self, instruments, start_time, end_time, segments=None):
        """Create a Qlib ``DatasetH`` backed by the ``Alpha158`` handler.

        Args:
            instruments: Instrument universe passed to ``Alpha158``.
            start_time: Start of the data range.
            end_time: End of the data range.
            segments: Optional mapping with ``train``/``valid``/``test`` date
                ranges. Defaults to train until 2020-12-31, valid for the
                first half of 2021 and test from 2021-07-01 to ``end_time``.

        Returns:
            The constructed ``DatasetH``.
        """
        # NOTE(review): the former ``fit_processors=[]`` argument was dropped;
        # it does not appear among the documented Alpha158 parameters and
        # looks like it would be rejected as an unexpected keyword. Confirm.
        handler = Alpha158(
            instruments=instruments,
            start_time=start_time,
            end_time=end_time,
            infer_processors=[],
            learn_processors=["DropnaLabel", "CSZScoreNorm"]
        )
        if segments is None:
            segments = {
                "train": (start_time, "2020-12-31"),
                "valid": ("2021-01-01", "2021-06-30"),
                "test": ("2021-07-01", end_time),
            }
        return DatasetH(handler=handler, segments=segments)

    def custom_factor_research(self, factor_data, labels):
        """Evaluate a custom factor against ``labels``; see ``calculate_ic``."""
        return self.calculate_ic(factor_data, labels)

    @staticmethod
    def _as_long_frame(data, value_name):
        """Return ``data`` as a frame with ``datetime``, ``instrument``, value.

        The key columns are used when present. Otherwise they are read from
        a two-level index, by level name when the names match and by position
        (date first, instrument second) when they do not.

        Raises:
            KeyError: If the keys or the value column cannot be located.
        """
        keys = ['datetime', 'instrument']
        if isinstance(data, pd.Series):
            frame = data.to_frame(name=value_name)
        else:
            frame = data.copy()

        for position, key in enumerate(keys):
            if key in frame.columns:
                continue
            if not isinstance(frame.index, pd.MultiIndex):
                raise KeyError(key)
            level = key if key in frame.index.names else position
            frame[key] = frame.index.get_level_values(level)

        # The keys are columns now; dropping the index avoids name clashes
        # between index levels and columns.
        frame = frame.reset_index(drop=True)
        frame['datetime'] = pd.to_datetime(frame['datetime'])

        if value_name not in frame.columns:
            candidates = [c for c in frame.columns if c not in keys]
            if len(candidates) != 1:
                raise KeyError(value_name)
            frame = frame.rename(columns={candidates[0]: value_name})
        return frame[keys + [value_name]]

    def calculate_ic(self, factor_df, returns_df, method='pearson'):
        """Compute the cross-sectional information coefficient per date.

        Args:
            factor_df: Factor values (column ``factor``, or a Series) keyed by
                ``datetime`` and ``instrument`` as columns or index levels.
            returns_df: Returns (column ``return``, or a Series) keyed the
                same way.
            method: Correlation method passed to ``Series.corr``.

        Returns:
            dict with ``IC_mean``, ``IC_std``, ``IC_IR`` (0 when the standard
            deviation is not positive) and ``IC_series`` (one value per date).
        """
        merged_data = pd.merge(
            self._as_long_frame(factor_df, 'factor'),
            self._as_long_frame(returns_df, 'return'),
            on=['datetime', 'instrument']
        )
        ic_series = pd.Series(
            {
                date: rows['factor'].corr(rows['return'], method=method)
                for date, rows in merged_data.groupby('datetime')
            },
            dtype=float
        )
        ic_series.index.name = 'datetime'

        ic_mean = ic_series.mean()
        ic_std = ic_series.std()
        return {
            'IC_mean': ic_mean,
            'IC_std': ic_std,
            'IC_IR': ic_mean / ic_std if ic_std > 0 else 0,
            'IC_series': ic_series
        }
6. 主程序示例
# main.py
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from src.data_loader import AShareDataLoader
from src.factor_builder import FactorFactory
from src.alphalens_analysis import AlphaLensAnalyzer
from src.qlib_integration import QlibAdapter
def main():
    """Run the example pipeline: load bars, build factors, analyse, save.

    Reads ``data/raw/A股日线数据.xlsx``, analyses a 20-day momentum factor
    with Alphalens, writes a per-period summary to
    ``results/factor_performance.csv`` and then analyses several factors in
    turn.
    """
    import os

    # Initialise the helpers.
    data_loader = AShareDataLoader('data/raw/')
    factor_builder = FactorFactory()
    analyzer = AlphaLensAnalyzer()
    # Output files below are written here; make sure the folder exists.
    os.makedirs('results', exist_ok=True)

    # 1. Load the local Excel data.
    print("加载本地Excel数据...")
    df = data_loader.load_excel_data('data/raw/A股日线数据.xlsx')
    print(f"数据形状: {df.shape}")
    print(f"时间范围: {df['date'].min()} 到 {df['date'].max()}")
    print(f"股票数量: {df['code'].nunique()}")
    print("\n数据样例:")
    print(df.head())

    # 2. Build the wide price panel (dates x codes).
    print("\n创建面板数据...")
    prices = df.pivot_table(
        index='date',
        columns='code',
        values='close'
    )

    # 3. Build the factors.
    print("\n构建量化因子...")
    momentum_factor = factor_builder.momentum_factor(prices, lookback_period=20)
    mean_rev_factor = factor_builder.mean_reversion_factor(prices, lookback_period=20)
    # RSI is computed one stock at a time on date-indexed bars, so rolling
    # windows never span two stocks and the result is a dates x codes panel.
    rsi_factor = pd.DataFrame({
        code: factor_builder.technical_factors(bars.set_index('date'))['RSI']
        for code, bars in df.sort_values('date').groupby('code')
    })

    # 4. Reshape the momentum factor into the (date, asset) long format.
    print("\n准备Alphalens分析数据...")
    momentum_factor_flat = momentum_factor.stack()
    momentum_factor_flat.name = 'momentum'
    momentum_factor_flat.index.names = ['date', 'asset']

    # 5. Run the Alphalens preparation step.
    print("运行Alphalens分析...")
    factor_data = analyzer.prepare_factor_data(
        momentum_factor_flat,
        prices,
        quantiles=5,
        periods=(1, 5, 10, 20)
    )

    # 6. Produce the analysis report.
    print("生成因子分析报告...")
    results = analyzer.run_full_analysis(factor_data, "Momentum_Factor")

    # 7. Save one summary row per holding period. ``ic`` has one column per
    #    period, so its mean/std are Series and must be combined element-wise
    #    rather than tested with a plain ``if``.
    print("保存分析结果...")
    ic_mean = results['ic'].mean()
    ic_std = results['ic'].std()
    mean_return = results['mean_return']
    results_df = pd.DataFrame({
        'IC_mean': ic_mean,
        'IC_std': ic_std,
        # Non-positive std becomes NaN before the division, then IR is 0.
        'IC_IR': (ic_mean / ic_std.where(ic_std > 0)).fillna(0),
        'Top_quantile_return': mean_return.iloc[-1],
        'Bottom_quantile_return': mean_return.iloc[0]
    })
    results_df.index.name = 'period'
    results_df.to_csv('results/factor_performance.csv')

    # 8. Multi-factor example.
    print("\n进行多因子分析...")
    factors_dict = {
        'Momentum': momentum_factor,
        'Mean_Reversion': mean_rev_factor,
        'RSI': rsi_factor
    }
    for factor_name, factor_values in factors_dict.items():
        if factor_values is None or factor_values.empty:
            continue
        try:
            factor_flat = factor_values.stack()
            factor_flat.name = factor_name.lower()
            factor_flat.index.names = ['date', 'asset']
            factor_data = analyzer.prepare_factor_data(factor_flat, prices)
            analyzer.run_full_analysis(factor_data, factor_name)
        except Exception as e:
            # Best effort: one failing factor must not stop the others.
            print(f"分析因子 {factor_name} 时出错: {e}")

    print("\n分析完成!")


if __name__ == "__main__":
    main()
7. Jupyter Notebook示例
# notebooks/factor_analysis.ipynb
"""
# A-share quantitative factor analysis example
This notebook shows how to analyse A-share factors with Alphalens, FinMind and Qlib.
"""
# Import the required libraries
import sys
sys.path.append('../')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
# Enable Chinese glyphs in matplotlib output
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Import the project modules
from src.data_loader import AShareDataLoader
from src.factor_builder import FactorFactory
from src.alphalens_analysis import AlphaLensAnalyzer

# ## 1. Data loading
data_loader = AShareDataLoader()
df = data_loader.load_multiple_excel('../data/raw/')
print("数据基本信息:")
print(f"时间范围: {df['date'].min()} 到 {df['date'].max()}")
print(f"股票数量: {df['code'].nunique()}")
print(f"总记录数: {len(df)}")

# ## 2. Pre-processing
# Wide panels: one row per date, one column per stock code.
prices = df.pivot_table(index='date', columns='code', values='close')
volumes = df.pivot_table(index='date', columns='code', values='volume')
# Fill gaps. NOTE(review): bfill copies later values backwards in time,
# which leaks future information into earlier rows.
prices = prices.ffill().bfill()
volumes = volumes.ffill().bfill()

# ## 3. Factor calculation
factor_builder = FactorFactory()
momentum_20d = factor_builder.momentum_factor(prices, 20)
volatility_20d = factor_builder.volatility_factor(prices, 20)
# RSI is computed one stock at a time on date-indexed bars, which gives a
# dates x codes panel that lines up with the price and momentum panels.
rsi_panel = pd.DataFrame({
    code: factor_builder.technical_factors(bars.set_index('date'))['RSI']
    for code, bars in df.sort_values('date').groupby('code')
})

# ## 4. Alphalens analysis
analyzer = AlphaLensAnalyzer()
momentum_flat = momentum_20d.stack()
momentum_flat.name = 'momentum'
momentum_flat.index.names = ['date', 'asset']
factor_data = analyzer.prepare_factor_data(momentum_flat, prices)
results = analyzer.run_full_analysis(factor_data, "20日动量因子")

# ## 5. Combined factor score
# Each factor is standardised per stock over the whole sample (column-wise).
momentum_z = (momentum_20d - momentum_20d.mean()) / momentum_20d.std()
rsi_z = (rsi_panel - rsi_panel.mean()) / rsi_panel.std()
# Align RSI to the price panel so both terms share index and columns.
rsi_z = rsi_z.reindex(index=prices.index, columns=prices.columns)
# Example blend: momentum plus RSI reversal (high RSI lowers the score).
factor_scores = 0.6 * momentum_z + 0.4 * (-rsi_z)
# Stocks without any usable score are left out of the score table.
factor_scores = factor_scores.dropna(axis=1, how='all')
def backtest_factor(factor_scores, prices, top_n=10, hold_period=5):
"""
简单回测函数
"""
returns = prices.pct_change()
positions = pd.DataFrame(0, index=factor_scores.index, columns=factor_scores.columns)
for i in range(hold_period, len(factor_scores), hold_period):
current_date = factor_scores.index[i]
# 选择前N个股票
top_stocks = factor_scores.loc[current_date].nlargest(top_n).index
# 持有这些股票
for j in range(hold_period):
if i + j < len(positions):
positions.iloc[i + j][top_stocks] = 1
# 计算组合收益
portfolio_returns = (positions.shift(1) * returns).sum(axis=1) / top_n
return portfolio_returns
# Run the backtest
portfolio_returns = backtest_factor(factor_scores, prices)
cumulative_returns = (1 + portfolio_returns).cumprod()

# Plot the backtest result
fig, axes = plt.subplots(2, 1, figsize=(12, 10))

# Cumulative return
axes[0].plot(cumulative_returns.index, cumulative_returns.values)
axes[0].set_title('组合累计收益')
axes[0].set_ylabel('累计收益')
axes[0].grid(True)

# Monthly return: compound the daily returns inside each calendar month.
# Grouping by monthly period avoids the 'M' resample alias, which newer
# pandas releases deprecate in favour of 'ME'.
monthly_growth = (1 + portfolio_returns).groupby(
    portfolio_returns.index.to_period('M')
).prod()
monthly_returns = monthly_growth - 1
axes[1].bar(monthly_returns.index.strftime('%Y-%m'), monthly_returns.values)
axes[1].set_title('月度收益')
axes[1].set_ylabel('月度收益')
# Rotate the labels of the bar chart explicitly rather than relying on
# whichever axes happens to be current.
axes[1].tick_params(axis='x', labelrotation=45)

plt.tight_layout()
plt.show()
8. 配置文件
# config.py
import os
from datetime import datetime
# Data locations (relative to the working directory).
DATA_CONFIG = dict(
    raw_data_path='data/raw/',
    processed_data_path='data/processed/',
    features_path='data/features/',
    results_path='results/',
)

# Stock universes, keyed by display name.
STOCK_UNIVERSE = {
    '沪深300': ['000300.SH'],  # index constituents can be used here
    '中证500': ['000905.SH'],
    '全A股': 'all',  # use every available stock
}

# Factor parameters.
FACTOR_PARAMS = dict(
    momentum_periods=[5, 10, 20, 60],
    mean_reversion_periods=[5, 10, 20],
    volatility_window=20,
    liquidity_window=5,
)

# Backtest parameters.
BACKTEST_PARAMS = dict(
    start_date='2020-01-01',
    end_date='2023-12-31',
    initial_capital=1000000,
    transaction_cost=0.001,  # 0.1% cost per trade
    top_n=10,  # number of stocks selected per rebalance
    hold_period=20,  # holding period in trading days
)

# Create every configured directory when this module is imported.
for path in DATA_CONFIG.values():
    os.makedirs(path, exist_ok=True)
使用说明
数据准备:
将A股日线数据保存为Excel格式(工作表名默认为“日线数据”),包含以下字段:日期、股票代码、开盘价、最高价、最低价、收盘价、成交量、成交额
数据放在data/raw/目录下
安装依赖:
pip install -r requirements.txt
运行分析:
# 运行主程序
python main.py
# 或使用Jupyter Notebook
jupyter notebook notebooks/factor_analysis.ipynb
自定义因子:
在factor_builder.py中添加新的因子计算方法
在main.py中调用新因子进行分析
这个实例提供了完整的框架,可以根据需要:
添加更多因子类型
集成更多数据源
优化回测逻辑
添加风险管理模块
实现自动化交易信号生成
上一篇
《趋势跟踪》 笔记
因子量化投资开源库
因子与量化投资联系和区别