commit 584d9835c1e3b649920cc0bd08cd81928de688c1
parent 3d6ef55aaeb4ff183c6e3d98fd117b997da46b62
Author: miksa234 <milutin@popovic.xyz>
Date: Tue, 19 Apr 2022 10:59:07 +0200
fixed a bug where cached items would only be printed once per found repository
Diffstat:
1 file changed, 7 insertions(+), 5 deletions(-)
diff --git a/sort_data.py b/sort_data.py
@@ -18,7 +18,7 @@ def main():
dframes[fname] = pd.read_csv(path + fname, sep='|', keep_default_na=False)
cache_list = [['package', 'requirement']]
- for i, fname in enumerate(file_names):
+ for i, fname in enumerate(file_names[:10]):
print(fname, f'{round(i/len(file_names)*100, 1)}%', sep='\t')
@@ -33,11 +33,13 @@ def main():
cache_list.append([package, requirement])
dframes[fname]['requirement'].iloc[j] = ''
- if package in list(zip(*cache_list))[1]:
- found_package, found_requirement = cache_list[list(zip(*cache_list))[1].index(package)]
- dframes[fname].append({'package': found_package, 'requirement': found_requirement},\
+ if (index_found := np.where(np.array(cache_list, dtype='object')[:,1] == package)[0]).size != 0:
+ for i_found in index_found:
+ found_package, found_requirement = cache_list[i_found]
+ dframes[fname].append({'package': found_package, 'requirement': found_requirement},\
ignore_index=True)
- del cache_list[list(zip(*cache_list))[1].index(package)]
+ del cache_list[i_found]
+
dframes[fname] = dframes[fname].drop_duplicates()
dframes[fname].to_csv('./data_sorted/' + fname, sep='|', index=False)
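
Context for the change: the old code looked the package up in the cache with list.index(), which only ever returns the first match, so a requirement cached more than once was appended to the frame a single time. The new code gathers every matching cache index with np.where and appends one row per match. Below is a minimal standalone sketch of that lookup, not code from the repository: demo_cache and cached_rows_for are illustrative names, the rows are collected into a fresh DataFrame instead of calling DataFrame.append, and matches are deleted in reverse index order so list positions stay valid, whereas the committed loop deletes inside the forward iteration.

    # Sketch only; assumes cache_list-style data, i.e. a list of
    # [package, requirement] pairs with a header row, as built in sort_data.py.
    import numpy as np
    import pandas as pd

    demo_cache = [['package', 'requirement'],
                  ['pandas', 'numpy'],
                  ['scipy', 'numpy'],
                  ['requests', 'urllib3']]

    def cached_rows_for(cache_list, package):
        # np.where over the second column returns *every* index whose cached
        # requirement equals the package, instead of only the first match
        # that list.index() finds.
        index_found = np.where(np.array(cache_list, dtype='object')[:, 1] == package)[0]
        rows = []
        # Delete from the highest index down so the remaining positions stay
        # valid while entries are removed from the cache.
        for i_found in sorted(index_found, reverse=True):
            found_package, found_requirement = cache_list[i_found]
            rows.append({'package': found_package, 'requirement': found_requirement})
            del cache_list[i_found]
        return pd.DataFrame(rows)

    print(cached_rows_for(demo_cache, 'numpy'))   # two rows: scipy and pandas
    print(len(demo_cache))                        # 2 entries left in the cache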