value_a=np.around(np.random.normal(0,1, (batch_size, col)), decimals=5, out=None) df_feature=pd.DataFrame(value_a,columns=[f"x{i}"foriinrange(col)]) ifwith_label: df_y=pd.DataFrame(np.random.choice(2, batch_size),dtype=np.int64,columns=["y"]) one_iter_data=pd.concat([...
下面的程序就是测试使用追加保存的方式,第一次写入一千万记录,第二次再追加一千万记录,测试性能。并测试不同压缩算法下的性能。 importosimporttimeimportnumpyasnpimportpandasaspd# 生成随机数据defgenerate_random_data(num_records=1000000):data={'id':np.arange(num_records),'name':np.random.choice([...
"y", "z"], 1000), ...: "b": np.random.choice(["e", "f", "g"], 1000), ...: "c": np.random.randn(1000), ...: "d": np.random.randn(1000) - 1, ...: }, ...: ) ...: In [39]: data.plot.hist(by=["a", "b"], figsize=(10, 5)); ### 箱线...
return random_dtdef generate_data(n=1000): items = [f"i_{x}" for x in range(n)] start_dates = [random_dt_bw(datetime.date(2020,1,1),datetime.date(2020,9,1)) for x in range(n)] end_dates = [x + datetime.timedelta(days=random.randint(1,10)) for x in start_dates] offer...
这个解法的前提是原来的 DataFrame df_titanic 里头的索引是独一无二的,另外记得设定 random_state 以方便别人重现你的结果。 用SQL的方式合并两个DataFrames 很多时候你会想要将两个DataFrames 依照某个共通的栏位(键值)合并成单一DataFrame 以整合资讯,比方说给定以下两个DataFrames: ...
seed(42) # Create a data frame with 5 rows and 3 columns containing random integers between 0 and 9 df = pd.DataFrame(index=range(5), columns=['RandomA', 'RandomB', 'RandomC']) # Generate random integers using apply() and a lambda function df['RandomA'] = df.apply(lambda _: ...
defgenerate_sample_data_datetime():np.random.seed(123)number_of_rows=365*2num_cols=5start_date='2023-09-15'# You can change the start dateifneeded cols=["C_0","C_1","C_2","C_3","C_4"]df=pd.DataFrame(np.random.randint(1,100,size=(number_of_rows,num_cols)),columns=cols...
random.choice(n_rows, nan_cnt, replace=False) values[index] = np.nan dataset[name] = values types[name] = 'float32' for col in range(cat_count): name = f'c{col}' cats = generate_categories() values = np.array(np.random.choice(cats, n_rows, replace=True),...
cats = generate_categories() values = np.array(np.random.choice(cats, n_rows, replace=True), dtype=object) nan_cnt = np.random.randint(1, int(max_nan*n_rows)) index = np.random.choice(n_rows, nan_cnt, replace=False) values[index] = np.nan ...
df = pd.DataFrame(np.random.randint(1,100, size = (number_or_rows, num_cols)), columns=cols) df.index = pd.date_range(start=start_date, periods=number_or_rows) returndf df=generate_sample_data_datetime 以上生成的数据,其时间索引是以天为频率的。