# 001.2_如何用Python下载全部数据.py

"""
AB量化，让数据驱动。【本程序可能会与时更新，请经常登录：abtrue.com  进行查阅】
作者微信：chaoxian102  邮箱：abtrue@hotmail.com  X：https://x.com/@choxin17384
为兼容win7系统，本站所有程序运行环境均为Python3.8版本
为了易于学习和使用，本站每个程序均用一个文件独立呈现，并且只用Python自带标准库
支持打赏，PayPal：https://paypal.me/abtruecom  支付宝：abtrue@hotmail.com
USDT：0x74c86D2f7c7096cFBb9Ec8D373a586E62176b62A (ETH:ERC20)
      3HvEcHBrSKWauAVG36zcUoxZtgegy4vijd (BTC:OMNI)
      TPjJF1Hd953ehUPVKLTFDhzPe3YyaSSFVR (Tron:TRC10/TRC20)
丰俭自行，非常感谢。
"""

# -*- coding: UTF-8 -*-

from urllib import request, parse
from urllib.parse import quote
import urllib.parse

import json
import time
import datetime
import io
import os
import shutil
import sys
import hashlib
import hmac
import base64
import random
import zipfile
import zlib
import gzip


#----------------时间到强制结束线程，适合多线程并行请求非本地自控资源时
import threading
import inspect
import ctypes

def _async_raise(tid, exctype):
    """raises the exception, performs cleanup if needed"""
    tid = ctypes.c_long(tid)
    if not inspect.isclass(exctype):
        exctype = type(exctype)
    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, ctypes.py_object(exctype))
    if res == 0:
        raise ValueError("invalid thread id")
    elif res != 1:
        # """if it returns a number greater than one, you're in trouble,
        # and you should call it again with exc=NULL to revert the effect"""
        ctypes.pythonapi.PyThreadState_SetAsyncExc(tid, None)
        raise SystemError("PyThreadState_SetAsyncExc failed")
 
def stop_thread(thread):
    """Request that *thread* stop by raising SystemExit inside it.

    Args:
        thread: object exposing an ``ident`` attribute (the thread id
            forwarded to ``_async_raise``).
    """
    target_id = thread.ident
    _async_raise(target_id, SystemExit)
#================


def f0(x):
    """Convert *x* to a float, treating "" and None as 0.0.

    Args:
        x: a numeric string, a number, an empty string or None.

    Returns:
        ``0.0`` for an empty string or None, otherwise ``float(x)``.

    Raises:
        ValueError / TypeError: propagated from ``float`` when *x* cannot
            be converted.
    """
    if x is None or x == "":
        return 0.0
    return float(x)


def get_htmll(urll):
    """Fetch *urll* and return the response body as text.

    The request is sent with a fixed ``user-agent`` header and a 30 second
    timeout. A body whose ``Content-Encoding`` header mentions gzip is
    decompressed first; the bytes are then decoded as UTF-8, with
    undecodable bytes replaced.

    Args:
        urll: the URL to request.

    Returns:
        The decoded body, or ``""`` when anything goes wrong (the URL and
        the exception are printed in that case).
    """
    try:
        req = request.Request(urll, headers={'user-agent': 'Python3.8'})
        with request.urlopen(req, timeout=30) as resp:
            encoding_header = resp.getheader("Content-Encoding")
            payload = resp.read()

        # Only gzip-compressed bodies need unpacking; anything else is
        # used exactly as received.
        if encoding_header and "gzip" in encoding_header:
            payload = gzip.decompress(payload)

        return payload.decode("utf-8", "replace")
    except Exception as e:
        # Best effort: report the failing URL and the reason, then signal
        # failure to the caller with an empty string.
        print(urll)
        print(e)
        return ""


#---------------- Fetch the list of trading symbols
# Choose the page to bulk-download from on the "member browse" pages of
# www.abtrue.com. The URL must be a page that contains the final data
# links, not a category page.
#urll="https://www.abtrue.com/vip/cc0ea904809701d8.html"  # Shanghai/Shenzhen main-board A shares
urll="https://www.abtrue.com/vip/c26501c419af8da3.html"  # crypto trading pairs


def _parse_data_links(page):
    """Collect ``[nation, adj, sym, freq]`` for every data link in *page*.

    A data link looks like ``href='/data/<nation><adj>/<sym>_<freq>...html``.
    The last character of the folder name is ``adj`` (0 = unadjusted,
    1 = forward-adjusted, 2 = backward-adjusted; crypto only has 0) and the
    rest of the folder name is ``nation``.

    Links that do not fit this shape (empty folder, no ``_`` in the name)
    are skipped instead of raising IndexError, and scanning stops as soon
    as a delimiter can no longer be found, because ``str.find`` returning
    -1 would otherwise corrupt the scan position.

    Args:
        page: HTML text to scan; may be empty.

    Returns:
        A list of ``[nation, adj, sym, freq]`` lists, in page order.
    """
    marker = "href='/data/"
    found = []
    pos = page.find(marker)
    while pos > -1:
        pos += len(marker)
        slash = page.find("/", pos)
        if slash == -1:
            break  # no folder separator left, so no further links either
        dot = page.find(".html", slash + 1)
        if dot == -1:
            break  # no ".html" left, so no further complete links
        folder = page[pos:slash]
        parts = page[slash + 1:dot].split("_")
        if folder and len(parts) >= 2:
            found.append([folder[:-1], folder[-1], parts[0], parts[1]])
        # Continue right after the folder separator, as the scan always did.
        pos = page.find(marker, slash + 1)
    return found


htmll=get_htmll(urll)
ddd=_parse_data_links(htmll)

print(f"交易代码共：{len(ddd)}")
#================


#---------------- Download the trading data and save it locally as JSON
# Layout of each record in the downloaded JSON list:
#   ["0 trade time/date", 1 previous close, 2 open, 3 low, 4 high, 5 close,
#    6 volume, 7 turnover, 8 free-float turnover rate]
# Other figures can be derived from these raw values. Results of decimal
# (including percentage) calculations may differ slightly between
# platforms because of binary floating point and differing algorithms;
# this site simply uses the round function.
_MAX_ATTEMPTS = 5    # stop retrying one symbol after this many failures
_RETRY_DELAY = 1.0   # seconds, multiplied by the attempt number


def _load_saved(p_file):
    """Return the record list stored in *p_file*, or None if unusable.

    An unreadable or corrupt file (for example one left half-written by an
    interrupted run) is reported and treated as missing so that the symbol
    is downloaded again in full instead of aborting the whole run.
    """
    try:
        with open(p_file, 'r', encoding='utf-8-sig', newline='\r\n') as f:
            saved = json.loads(f.read())
    except (OSError, ValueError) as e:
        print(p_file)
        print(e)
        return None
    return saved if isinstance(saved, list) else None


def _save_records(p_file, records):
    """Write *records* to *p_file* as indented JSON.

    The data is written to a temporary sibling file first and then moved
    into place, so an interrupted run cannot leave a truncated data file.
    Only "\n" is written: the file is opened with newline='\r\n', which
    already turns every "\n" into "\r\n".
    """
    tmp_file = p_file + ".tmp"
    with open(tmp_file, 'w', encoding='utf-8', newline='\r\n') as f:
        f.write(json.dumps(records, indent=4, ensure_ascii=False) + "\n")
    os.replace(tmp_file, p_file)


failed = []
while ddd:
    time.sleep(0.1)  # the site is a personal one with limited resources
    dd = ddd.pop()
    nation, adj, sym, freq = dd
    p_file = f"data/{nation}{adj}/{sym}_{freq}.txt"
    os.makedirs(os.path.dirname(p_file), exist_ok=True)

    # Please do not edit the saved JSON files by hand.
    uuu = _load_saved(p_file) if os.path.exists(p_file) else None

    # With at least two saved records, request an update starting at the
    # second-to-last one; otherwise request everything (start == "").
    start = ""
    if uuu and len(uuu) >= 2:
        try:
            start = uuu[-2][0]
        except (IndexError, KeyError, TypeError):
            start = ""

    for attempt in range(1, _MAX_ATTEMPTS + 1):
        print(dd, start)
        try:
            urll = (f"https://www.abtrue.com:5121/down_data.py?nation={nation}"
                    f"&adj={adj}&sym={sym}&freq={freq}&start={start}")
            htmll = get_htmll(urll)
            vvv = json.loads(htmll)  # raises when the reply is not JSON

            if start == "":
                merged = vvv
            else:
                # Splice the reply in at the saved position of its first
                # record; raises when that record is not on disk.
                i = uuu.index(vvv[0])
                merged = uuu[:i]
                merged.extend(vvv)
            _save_records(p_file, merged)
            break
        except Exception as e:
            # Any failure falls back to a full download on the next try.
            start = ""
            print(e)
            print("--Err--")
            if attempt < _MAX_ATTEMPTS:
                # Back off instead of retrying in a tight loop.
                time.sleep(_RETRY_DELAY * attempt)
    else:
        failed.append(dd)

if failed:
    print(f"--skipped after {_MAX_ATTEMPTS} failed attempts: {failed}")
#================


print("--end--")
# 返回顶部
# 量化世界，世界量化中。
# 作者微信：chaoxian102 邮箱：abtrue@hotmail.com X：https://x.com/@choxin17384
# 个人站点，支持打赏，PayPal：https://paypal.me/abtruecom 支付宝：abtrue@hotmail.com
# USDT：0x74c86D2f7c7096cFBb9Ec8D373a586E62176b62A (ETH:ERC20) 3HvEcHBrSKWauAVG36zcUoxZtgegy4vijd (BTC:OMNI) TPjJF1Hd953ehUPVKLTFDhzPe3YyaSSFVR (Tron:TRC10/TRC20)
# 丰俭自行，非常感谢。
# -- BOTTOM --