sort_data.py (1731B)
#!/usr/bin/env python3.6
"""Hold back (package, requirement) rows until the requirement has appeared.

Every ``%Y-%m.csv`` file in the input directory is a pipe-delimited table
with a header line and two columns, ``package`` and ``requirement``.  Files
are processed in chronological order.  A row whose requirement has not yet
been seen in the ``package`` column is written with an empty requirement,
and the original row is re-emitted in the file where that requirement first
shows up as a package.
"""

import os
import warnings
from datetime import datetime

import numpy as np
import pandas as pd

warnings.simplefilter(action='ignore', category=FutureWarning)

_COLUMNS = ['package', 'requirement']


def _read_edges(file_path):
    """Return the data rows of one input file as ``[package, requirement]`` lists.

    The first line of the file is treated as the header and dropped.

    Raises:
        ValueError: if the file does not have exactly two columns.
    """
    table = np.genfromtxt(file_path, delimiter='|', dtype='str')
    # genfromtxt collapses a single-line (header-only) file to a 1-D array;
    # restore the row axis so dropping the header leaves zero rows instead of
    # a stray scalar that cannot be unpacked into (package, requirement).
    table = np.atleast_2d(table)
    if table.size and table.shape[1] != 2:
        raise ValueError(
            f'{file_path}: expected 2 "|"-separated columns, '
            f'found {table.shape[1]}'
        )
    return table[1:].tolist()


def _hold_back_unmet(rows, known_packages, pending):
    """Resolve one file's rows against the packages seen so far.

    Args:
        rows: iterable of ``(package, requirement)`` string pairs.
        known_packages: set of package names a requirement may point at.
        pending: dict mapping a not-yet-seen requirement to the list of
            packages waiting for it.  Updated in place.

    Returns:
        List of ``(package, requirement)`` tuples for this file, possibly
        containing duplicates.
    """
    resolved = []
    for package, requirement in rows:
        if requirement != '' and requirement not in known_packages:
            # Requirement does not exist yet: remember the row and keep only
            # the bare package in this file.
            pending.setdefault(requirement, []).append(package)
            requirement = ''
        resolved.append((package, requirement))

        # This package may be what earlier rows were waiting for.  The lookup
        # happens after the hold-back above so a self-referencing row
        # (package == requirement) is released immediately.
        for dependent in pending.pop(package, ()):
            resolved.append((dependent, package))
    return resolved


def main(path='./data/', out_path='./data_sorted/'):
    """Rewrite every monthly file in *path* into *out_path*.

    Args:
        path: directory holding the ``%Y-%m.csv`` input files.  Every entry
            in the directory must match that pattern.
        out_path: directory receiving one output file per input file, under
            the same name.  Created if it does not exist.

    For each file a progress line is printed: file name, percentage of files
    started, and the number of rows currently held back.

    Raises:
        ValueError: if a directory entry does not match ``%Y-%m.csv`` or a
            file does not have exactly two columns.
    """
    file_names = sorted(
        os.listdir(path),
        key=lambda name: datetime.strptime(name, '%Y-%m.csv'),
    )
    os.makedirs(out_path, exist_ok=True)

    known_packages = set()
    pending = {}

    for i, fname in enumerate(file_names):
        held_back = sum(len(waiting) for waiting in pending.values())
        print(fname, f'{round(i/len(file_names)*100, 1)}% {held_back}', sep='\t')

        rows = _read_edges(os.path.join(path, fname))
        packages_here = {package for package, _ in rows}
        if i == 0:
            # There is nothing before the first file, so its requirements
            # are checked against its own packages.
            known_packages |= packages_here

        resolved = _hold_back_unmet(rows, known_packages, pending)

        # Later files may depend on anything that appeared up to here.
        known_packages |= packages_here

        # De-duplicate and order rows by (package, requirement).
        unique_rows = sorted(set(resolved))
        pd.DataFrame(unique_rows, columns=_COLUMNS).to_csv(
            os.path.join(out_path, fname), sep='|', index=False
        )


if __name__ == '__main__':
    main()