forked from github/CodeSearchNet
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathparse_python_data.py
More file actions
executable file
·295 lines (238 loc) · 12.3 KB
/
parse_python_data.py
File metadata and controls
executable file
·295 lines (238 loc) · 12.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
#!/usr/bin/env python
"""
Acquires python data from local disk or GCP and performs parsing, cleaning and tokenization steps
to form a parallel corpus of (code, docstring) pairs with additional metadata. Processed data
is saved as multi-part jsonl files to the OUTPUT_PATH.
Usage:
parse_python_data.py [options] OUTPUT_PATH
Options:
-h --help Show this screen.
--input-folder=<path> Use the given input folder instead of downloading.
--azure-info=<path> Azure authentication information file (JSON). Used to load data from Azure storage
--debug Enable debug routines. [default: False]
"""
import re
import os
from multiprocessing import Pool
from typing import List, NamedTuple
import pandas as pd
import parso
from docopt import docopt
from dpu_utils.utils import RichPath, run_and_debug
from tqdm import tqdm
from dataextraction.utils import tokenize_docstring_from_string
from utils.pkldf2jsonl import chunked_save_df_to_jsonl
IS_WHITESPACE_REGEX = re.compile(r'\s+')
class ParsedCode(NamedTuple):
code_tokens: List[str]
comment_tokens: List[str]
def tokenize_python_from_string(code: str,
func_only: bool=True,
report_errors: bool=False,
only_ids: bool=False,
add_keywords: bool=True) -> ParsedCode:
"""
Tokenize Python code given a string.
Args:
code: The input code
func_only: if you want to only parse functions in code.
report_errors: Flag that turns on verbose error reporting
only_ids: Return only the identifiers within the code
add_keywords: Return keywords (used only when only_ids=True)
Returns:
Pair of lists. First list is sequence of code tokens; second list is sequence of tokens in comments.
"""
try:
try:
parsed_ast = parso.parse(code, error_recovery=False, version="2.7")
except parso.parser.ParserSyntaxError:
parsed_ast = parso.parse(code, error_recovery=False, version="3.7")
code_tokens, comment_tokens = [], []
func_nodes = list(parsed_ast.iter_funcdefs())
# parse arbitrary snippets of code that are not functions if func_only = False
if not func_only:
func_nodes = [parsed_ast]
for func_node in func_nodes: # There should only be one, but we can process more...
doc_node = func_node.get_doc_node()
leaf_node = func_node.get_first_leaf()
while True:
# Skip over the docstring:
if leaf_node is doc_node:
leaf_node = leaf_node.get_next_leaf()
# First, retrieve comment tokens:
for prefix in leaf_node._split_prefix():
if prefix.type == 'comment':
comment_text = prefix.value[1:] # Split off the leading "#"
comment_tokens.extend(tokenize_docstring_from_string(comment_text))
# Second, stop if we've reached the end:
if leaf_node.type == 'endmarker':
break
# Third, record code tokens:
if not(IS_WHITESPACE_REGEX.match(leaf_node.value)):
if only_ids:
if leaf_node.type == 'name':
code_tokens.append(leaf_node.value)
else:
if leaf_node.type == 'keyword':
if add_keywords:
code_tokens.append(leaf_node.value)
else:
code_tokens.append(leaf_node.value)
leaf_node = leaf_node.get_next_leaf()
return ParsedCode(code_tokens=code_tokens, comment_tokens=comment_tokens)
except Exception as e:
if report_errors:
print('Error tokenizing: %s' % (e,))
return ParsedCode(code_tokens=[], comment_tokens=[])
def download_files_into_pandas(i: int=10) -> pd.DataFrame:
"""Get files from Google Cloud Platform, there are 10 files.
Args:
i : int between 1 and 10 that specifies how many of the 10 files you
want to download. You should only use this argument for testing.
Files are obtained by this query: https://console.cloud.google.com/bigquery?sq=235037502967:58a5d62f75f34d22b0f70d38b9352a85
"""
frames = []
for i in tqdm(range(i), total=i):
success = False
while not success:
try:
frame = pd.read_csv(f'https://storage.googleapis.com/kubeflow-examples/code_search_new/python_raw_v2/00000000000{i}.csv', encoding='utf-8')
frames.append(frame)
success = True
except Exception as e:
print(f'Error downloading file {i}:\n {e}, retrying...')
df = pd.concat(frames)
df['repo'] = df['repo_path'].apply(lambda r: r.split()[0])
df['path'] = df['repo_path'].apply(lambda r: r.split()[1])
df.drop(columns=['repo_path'], inplace=True)
df = df[['repo', 'path', 'content']]
return df
def load_files_into_pandas(input_folder: str) -> pd.DataFrame:
"""Get files from a local directory.
Args:
input_folder: the folder containing the .csv files
"""
frames = []
for file in os.listdir(input_folder):
if not file.endswith('.csv'):
continue
frame = pd.read_csv(os.path.join(input_folder, file), encoding='utf-8')
frames.append(frame)
df = pd.concat(frames)
df['repo'] = df['repo_path'].apply(lambda r: r.split()[0])
df['path'] = df['repo_path'].apply(lambda r: r.split()[1])
df.drop(columns=['repo_path'], inplace=True)
df = df[['repo', 'path', 'content']]
return df
def parse_raw_data_into_function_list(blob, require_docstring: bool=True):
"""Extract per-function data from a given code blob.
Filters out undesirable function types. Keep only the first line of the docstring, and remove all internal comments from
the code.
Args:
blob: String containing some python code.
Returns:
List of functions represented by dictionaries containing the code, docstring and metadata.
"""
parsed_data_list = []
try:
try:
parsed_module = parso.parse(blob, error_recovery=False, version="2.7")
except parso.parser.ParserSyntaxError:
parsed_module = parso.parse(blob, error_recovery=False, version="3.7")
function_defs = list(parsed_module.iter_funcdefs())
for class_def in parsed_module.iter_classdefs():
function_defs.extend(class_def.iter_funcdefs())
for function_def in function_defs:
function_name = function_def.name.value
docstring_node = function_def.get_doc_node()
if docstring_node is None:
docstring = ''
else:
docstring = docstring_node.value
first_docstring_line = docstring.split('\n\s*\n')[0]
# We now need to un-indent the code which may have come from a class. For that, identify how far
# we are indented, and try to to remove that from all lines:
function_code = function_def.get_code()
def_prefix = list(function_def.get_first_leaf()._split_prefix())[-1].value
trimmed_lines = []
for line in function_code.splitlines():
if line.startswith(def_prefix):
trimmed_lines.append(line[len(def_prefix):])
function_code = '\n'.join(trimmed_lines)
should_use_function = not (re.search(r'(__.+__)|(.*test.*)|(.*Test.*)', function_name) or # skip __*__ methods and test code
re.search(r'NotImplementedException|@abstractmethod', function_code) or
len(function_code.split('\n')) <= 2 or # should have more than 1 line of code (the declaration is one line)
(len(first_docstring_line.split()) <= 2) and require_docstring) # docstring should have at least 3 words.
if should_use_function:
parsed_data_list.append({'code': function_code,
'docstring': first_docstring_line,
'language': 'python',
'lineno': function_def.start_pos[0],
'func_name': function_name,
})
except parso.parser.ParserSyntaxError:
pass
return parsed_data_list
def listlen(x):
if not isinstance(x, list):
return 0
return len(x)
def run(args):
azure_info_path = args.get('--azure-info')
output_folder = RichPath.create(args['OUTPUT_PATH'], azure_info_path)
# Download / read the data files:
if args['--input-folder'] is None:
print('Downloading data...')
raw_code_data_df = download_files_into_pandas()
else:
print('Loading data...')
raw_code_data_df = load_files_into_pandas(args['--input-folder'])
print('Data loaded.')
# Find all the functions and methods, filter out ones that don't meet requirements,
# separate the code from the docstring and produce a list of functions that includes the code,
# the first line of the docstring, and metadata of each:
with Pool() as pool:
function_data = pool.map(parse_raw_data_into_function_list, raw_code_data_df.content.tolist())
assert len(function_data) == raw_code_data_df.shape[0], \
f'Row count mismatch. `raw_code_data_df` has {raw_code_data_df.shape[0]} rows; `function_data` has {len(function_data)} rows.'
raw_code_data_df['function_data'] = function_data
print(f'Split {raw_code_data_df.shape[0]} blobs into {sum(len(fun_data) for fun_data in function_data)} documented individual functions.')
# Flatten function data out:
# TODO: We should also have access to the SHA of the objects here.
raw_code_data_df = raw_code_data_df.set_index(['repo', 'path'])['function_data'].apply(pd.Series).stack()
raw_code_data_df = raw_code_data_df.reset_index()
raw_code_data_df.columns = ['repo', 'path', '_', 'function_data']
# Extract meta-data and format dataframe.
function_data_df = pd.DataFrame(raw_code_data_df.function_data.values.tolist())
assert len(raw_code_data_df) == len(function_data_df), \
f'Row count mismatch. `raw_code_data_df` has {len(raw_code_data_df)} rows; `function_data_df` has {len(function_data_df)} rows.'
function_data_df = pd.concat([raw_code_data_df[['repo', 'path']], function_data_df], axis=1)
# remove observations where the same code appears more than once
num_before_dedup = len(function_data_df)
function_data_df = function_data_df.drop_duplicates(['code'])
num_after_dedup = len(function_data_df)
print(f'Removed {num_before_dedup - num_after_dedup} exact duplicate rows.')
print('Tokenizing code, comments and docstrings ...')
with Pool() as pool:
code_tokenization_results: List[ParsedCode] = pool.map(tokenize_python_from_string,
function_data_df['code'].tolist())
code_tokens_list, comment_tokens_list = list(zip(*code_tokenization_results))
function_data_df['code_tokens'] = code_tokens_list
function_data_df['comment_tokens'] = comment_tokens_list
function_data_df['docstring_tokens'] = pool.map(tokenize_docstring_from_string,
function_data_df['docstring'].tolist())
function_data_df.dropna(subset=['code_tokens', 'comment_tokens', 'docstring_tokens'], inplace=True)
function_data_df.reset_index(inplace=True, drop=True)
cols_to_keep = ['repo', 'path', 'lineno', 'func_name', 'language',
'code', 'code_tokens', 'comment_tokens',
'docstring', 'docstring_tokens',
]
# write data to jsonl
print(f'Count by language:\n{function_data_df.language.value_counts()}')
chunked_save_df_to_jsonl(df=function_data_df[cols_to_keep],
output_folder=output_folder,
parallel=True)
print(f'Wrote {function_data_df.shape[0]} rows to {str(output_folder)}.')
if __name__ == '__main__':
args = docopt(__doc__)
run_and_debug(lambda: run(args), args.get('--debug'))