diff --git a/DATASHEET.md b/DATASHEET.md index a789e0c..4a7cacd 100644 --- a/DATASHEET.md +++ b/DATASHEET.md @@ -59,7 +59,7 @@ If so, please provide a description. If not, you may skip the remainder of the q * Any other comments? ### Uses * Has the dataset been used for any tasks already? If so, please provide a description. *See upcoming arXiv paper.* -* Is there a repository that links to any or all papers or systems that use the dataset? If so, please provide a link or other access point. *Relevent papers will be linked to in GitHub* +* Is there a repository that links to any or all papers or systems that use the dataset? If so, please provide a link or other access point. *Relevant papers will be linked to in GitHub* * What (other) tasks could the dataset be used for? *Dataset could be used to teach or evaluate human programmers in Python as well as computers* * Is there anything about the composition of the dataset or the way it was collected and preprocessed/cleaned/labeled that might impact future uses? For example, is there anything that a future user might need to know to avoid uses that @@ -86,4 +86,4 @@ an algorithm.* No personal data* * Will older versions of the dataset continue to be supported/hosted/maintained? If so, please describe how. If not, please describe how its obsolescence will be communicated to users. *Older versions will remain on GitHub for reproducibility* * If others want to extend/augment/build on/contribute to the dataset, is there a mechanism for them to do so? If so, please provide a description. Will these contributions be validated/verified? If so, please describe how. If not, why not? Is there a process for communicating/distributing these contributions to other users? *Contributions are welcome and appreciated through GitHub* -* Any other comments? \ No newline at end of file +* Any other comments? 
diff --git a/ICLR2023/README.md b/ICLR2023/README.md new file mode 100755 index 0000000..124afa4 --- /dev/null +++ b/ICLR2023/README.md @@ -0,0 +1,147 @@ +This is the code and data for the paper: Language Models can teach themselves to code better +https://arxiv.org/abs/2207.14502 + +LICENSE +MIT License - as already specified in the ../LICENSE file of PythonProgrammingPuzzles repo + +GPU USAGE +GPU usage was large, especially for the 2.7B sized model which is ~20X the 125M. +Data generation takes the most GPU usage and took about 2500 GPU hours for 2.7B (on v100) +Finetuning on the 1M generated data took about 40 GPU hours for 2.7B (on v100) per epoch of finetuning - 10 epochs = 400 GPU hours +Solving the 228 problem testset with 100 attempts using the finetuned 2.7B model took about 4 hours (on v100) +We mostly used v100, but we used whatever was available, so T4 and A100 sometimes if they were free. +We tried everything at 125M first - debug there and make it work perfectly - then roll out the 1.3B and 2.7B jobs + +DATASETS +The datasets used are in the data directory. We feel the most interesting dataset is data/Codex_PAPER_1M_iter_0.txt +which is generated by Codex and gave the best results when finetuned on. All the datasets are part of our public release. 
+ +SETUP +src/requirements.txt is what we install on our cluster machines - the cluster comes with NVidia drivers and matching pytorch +./requirements.txt is what I personally have installed on my local machine and tested that this runs - but it has lots of stuff you don't need +So try src/requirements.txt only - and if that doesn't work - then ./requirements.txt has all versions of everything installed on my machine +Getting a deepspeed 0.6.1 matching a pytorch matching a nvidia driver install was tricky for me on some machines, torch 1.10 and 1.11 both work + +GENERATING/FINETUNING -> run "cd src, ./babysit.sh GPU_INDEX_TO_USE" -> GPU_INDEX_TO_USE=0 typically +src/babysit.sh is the script that generates data, and finetunes on that data in a loop, finetuning the GPT-Neo 125M/1.3B/2.7B models +In src/babysit.sh TEST_LOCAL=1 controls running locally on machine's GPUs which is great for fast testing, or =0 is launching on the cluster which is slow but has lots of GPUs +Realistically you have to train on a cluster - data generation takes a long time so having lots of machines all generating data is the feasible approach. +But given enough time - this will run locally on 1 GPU. 1 year for 2.7B, or 2 weeks for 125M. +We found generating 75k samples after deduping worked for iteration_0 - finetune on that data. +Then using that fine_tuned model in iter_1 generating data happens more quickly - the finetuned model solves many more problems +Repeating that process works well. +On 125M we looked at training only on 125M generated data from iter_0 versus iter_1 versus iter_2 - generating 600K for each iteration. +It seemed finetuning on iter_2 data was best on the testset 26.9/228 solved vs iter_1=26.1/228 vs iter_0=22.2/228 +With 1M samples from 125M generated data sampled across all the iterations 0,1,2 we got 26.75/228 +We understand why it's faster to generate iter_2 data on a finetuned model - it solves more problems. 
+But why are the generated puzzles&solutions better for training the model on? +We will explore that more in the future - and try iterating a lot farther than 3 iterations - although our preliminary experiments on 125M show it tops out at 3 iterations + +FINETUNING ONLY -> run "cd src, ./fine_tune1.sh GPU_INDEX_TO_USE" -> GPU_INDEX_TO_USE=0 typically +# ./fine_tune1.sh GPU MODEL_TO_TRAIN EXPERIMENT_NAME_DIRECTORY TRAIN_DATA EPOCHS +This allows the repeated finetuning on a specific dataset. +Use this to do a temperature grid search, or try different variations of parameters on a specific dataset. + +Detailed instructions for reproducing experiments: +# Generating Codex data +python gen.py -n=32 -max_tokens=4096 -model_path=openai/code-davinci-002 -model_path_solve=openai/code-cushman-001 -out=../data/codex/iter_0 -seed=2022 + +# Measuring codex accuracy via API calls +./solve2.sh +python solve.py -prefix=../data/train_prefix.txt -attempts=1 -model_path=openai/code-cushman-001 -gpu=0 -fixed_temp=0.8 -out=../data/codex -puzzles=../data/test_228.json -seed=2022 -batch_size=64 + +# Producing verified Codex_PAPER_1M_iter_0.txt from the puzzle/solution old style data generated by Codex +python preprocess.py -path=../data/codex/old_verified -f_name=Codex_PAPER_1M_iter_0.txt -max_sols_per_puzzle=8 -old_style_json=True -max_examples=1000000 -include_failures=False -seed=2022 +cp ../data/codex/old/Codex_PAPER_1M_iter_0.txt ../data/Codex_PAPER_1M_iter_0.txt + +# Producing unverified Codex_unverified_PAPER_1M_iter_0.txt from the puzzle/solution old style data generated by Codex +python preprocess.py -path=../data/codex/old_unverified -f_name=Codex_unverified_PAPER_1M_iter_0.txt -max_sols_per_puzzle=8 -old_style_json=True -max_examples=1000000 -include_failures=True -seed=2022 +cp ../data/codex/old_unverified/Codex_unverified_PAPER_1M_iter_0.txt ../data/Codex_unverified_PAPER_1M_iter_0.txt + +# Producing 125M_PAPER_25K_iter_0.txt from the puzzle/solution new style data +python 
preprocess.py ../data/125M_PAPER/iter_0 125M_PAPER_25K_iter_0.txt 8 False 25000 False -seed=2022 +cp ../data/125M_PAPER/iter_0/125M_PAPER_25K_iter_0.txt ../data/125M_PAPER_25K_iter_0.txt + +# Producing 125M_PAPER_1M_iter_1.txt from the puzzle/solution new style data +python preprocess.py ../data/125M_PAPER/iter_1 125M_PAPER_1M_iter_1.txt 8 False 1000000 False -seed=2022 +cp ../data/125M_PAPER/iter_1/125M_PAPER_1M_iter_1.txt ../data/125M_PAPER_1M_iter_1.txt + +# Producing 125M_PAPER_1M_iter_2.txt from the puzzle/solution new style data +python preprocess.py ../data/125M_PAPER/iter_2 125M_PAPER_1M_iter_2.txt 8 False 1000000 False -seed=2022 +cp ../data/125M_PAPER/iter_2/125M_PAPER_1M_iter_2.txt ../data/125M_PAPER_1M_iter_2.txt + +# Producing 13B_PAPER_25K_iter_0.txt from the puzzle/solution new style data +python preprocess.py ../data/13B_PAPER/iter_0 13B_PAPER_25K_iter_0.txt 8 False 25000 False -seed=2022 +cp ../data/13B_PAPER/iter_0/13B_PAPER_25K_iter_0.txt ../data/13B_PAPER_25K_iter_0.txt + +# Producing 13B_PAPER_1M_iter_1.txt from the puzzle/solution new style data +python preprocess.py ../data/13B_PAPER/iter_1 13B_PAPER_1M_iter_1.txt 8 False 1000000 False -seed=2022 +cp ../data/13B_PAPER/iter_1/13B_PAPER_1M_iter_1.txt ../data/13B_PAPER_1M_iter_1.txt + +# Producing 13B_PAPER_1M_iter_2.txt from the puzzle/solution new style data +python preprocess.py ../data/13B_PAPER/iter_2 13B_PAPER_1M_iter_2.txt 8 False 1000000 False -seed=2022 +cp ../data/13B_PAPER/iter_2/13B_PAPER_1M_iter_2.txt ../data/13B_PAPER_1M_iter_2.txt + +# Producing 27B_PAPER_25K_iter_0.txt from the puzzle/solution new style data +python preprocess.py ../data/27B_PAPER/iter_0 27B_PAPER_25K_iter_0.txt 8 False 25000 False -seed=2022 +cp ../data/27B_PAPER/iter_0/27B_PAPER_25K_iter_0.txt ../data/27B_PAPER_25K_iter_0.txt + +# Producing 27B_PAPER_1M_iter_1.txt from the puzzle/solution new style data +python preprocess.py ../data/27B_PAPER/iter_1 27B_PAPER_1M_iter_1.txt 8 False 1000000 False -seed=2022 
+cp ../data/27B_PAPER/iter_1/27B_PAPER_1M_iter_1.txt ../data/27B_PAPER_1M_iter_1.txt + +# Producing 27B_PAPER_1M_iter_2.txt from the puzzle/solution new style data +python preprocess.py ../data/27B_PAPER/iter_2 27B_PAPER_1M_iter_2.txt 8 False 1000000 False -seed=2022 +cp ../data/27B_PAPER/iter_2/27B_PAPER_1M_iter_2.txt ../data/27B_PAPER_1M_iter_2.txt + +# Data files produced by babysit.sh - generating data from gpt-neo-* and Codex +# At the time of experiments running, Codex wasn't finetunable, so only iteration 0 data was available +Codex_PAPER_1M_iter_0.txt +125M_PAPER_25K_iter_0.txt +13B_PAPER_25K_iter_0.txt +27B_PAPER_25K_iter_0.txt +125M_PAPER_1M_iter_1.txt +13B_PAPER_1M_iter_1.txt +27B_PAPER_1M_iter_1.txt +125M_PAPER_1M_iter_2.txt +13B_PAPER_1M_iter_2.txt +27B_PAPER_1M_iter_2.txt + +# Figure 5 - 3 diagrams - showing the 3 GPT models trained on verified codex vs unverified codex vs baseline +# 5a GPT-NEO 125M +./fine_tune1.sh 0 125M ft1_Codex_PAPER_1M_iter_0 Codex_PAPER_1M_iter_0.txt +./fine_tune1.sh 0 125M ft1_Codex_unverified_PAPER_1M_iter_0 Codex_unverified_PAPER_1M_iter_0.txt +./solve1.sh 0 125M 10 228 +# 5b GPT-NEO 13B +./fine_tune1.sh 0 13B ft1_Codex_PAPER_1M_iter_0 Codex_PAPER_1M_iter_0.txt +./fine_tune1.sh 0 13B ft1_Codex_unverified_PAPER_1M_iter_0 Codex_unverified_PAPER_1M_iter_0.txt +./solve1.sh 0 13B 10 228 5 +# 5c GPT-NEO 27B +./fine_tune1.sh 0 27B ft1_Codex_PAPER_1M_iter_0 Codex_PAPER_1M_iter_0.txt +./fine_tune1.sh 0 27B ft1_Codex_unverified_PAPER_1M_iter_0 Codex_unverified_PAPER_1M_iter_0.txt +./solve1.sh 0 13B 10 228 5 + +# Figure 6 - 3 diagrams - showing test228 Pass@ for the 3 GPT models trained on data from 4 generators (codex and 3 GPT-Neo) and baseline +# 6a - GPT-NEO 125M trained on 4 different datasets and baseline +# ./fine_tune1.sh 0 125M ft1_Codex_PAPER_1M_iter_0 Codex_PAPER_1M_iter_0.txt (dupe of 5a) +./fine_tune1.sh 0 125M ft1_125M_PAPER_1M_iter_2 125M_PAPER_1M_iter_2.txt +./fine_tune1.sh 0 125M ft1_13B_PAPER_1M_iter_2 
13B_PAPER_1M_iter_2.txt +./fine_tune1.sh 0 125M ft1_27B_PAPER_1M_iter_2 27B_PAPER_1M_iter_2.txt + +# 6b - GPT-NEO 13B trained on 4 different datasets and baseline +# ./fine_tune1.sh 0 13B ft1_Codex_PAPER_1M_iter_0 Codex_PAPER_1M_iter_0.txt (dupe of 5b) +./fine_tune1.sh 0 13B ft1_125M_PAPER_1M_iter_2 125M_PAPER_1M_iter_2.txt +./fine_tune1.sh 0 13B ft1_13B_PAPER_1M_iter_2 13B_PAPER_1M_iter_2.txt +./fine_tune1.sh 0 13B ft1_27B_PAPER_1M_iter_2 27B_PAPER_1M_iter_2.txt + +# 6c - GPT-NEO 27B trained on 4 different datasets and baseline +# ./fine_tune1.sh 0 27B ft1_Codex_PAPER_1M_iter_0 Codex_PAPER_1M_iter_0.txt (dupe of 5c) +./fine_tune1.sh 0 27B ft1_125M_PAPER_1M_iter_2 125M_PAPER_1M_iter_2.txt +./fine_tune1.sh 0 27B ft1_13B_PAPER_1M_iter_2 13B_PAPER_1M_iter_2.txt +./fine_tune1.sh 0 27B ft1_27B_PAPER_1M_iter_2 27B_PAPER_1M_iter_2.txt + +# Launch on torch2020 - edit solve.yaml for correct parameters of model and epoch +./tst_human_eval_base.sh 0 125M 1024 +./tst_human_eval_ft1.sh 0 125M 1024 +./tst_human_eval_ft5.sh 0 125M 1024 +./tst_human_eval_ft10.sh 0 125M 1024 diff --git a/ICLR2023/data/125M_PAPER_1M_iter_1.txt.gz b/ICLR2023/data/125M_PAPER_1M_iter_1.txt.gz new file mode 100644 index 0000000..948467e Binary files /dev/null and b/ICLR2023/data/125M_PAPER_1M_iter_1.txt.gz differ diff --git a/ICLR2023/data/13B_PAPER_1M_iter_1.txt.gz b/ICLR2023/data/13B_PAPER_1M_iter_1.txt.gz new file mode 100644 index 0000000..f4fee2c Binary files /dev/null and b/ICLR2023/data/13B_PAPER_1M_iter_1.txt.gz differ diff --git a/ICLR2023/data/27B_PAPER_1M_iter_1.txt.gz b/ICLR2023/data/27B_PAPER_1M_iter_1.txt.gz new file mode 100644 index 0000000..d82c3fc Binary files /dev/null and b/ICLR2023/data/27B_PAPER_1M_iter_1.txt.gz differ diff --git a/ICLR2023/data/350M_PAPER_1M_iter_0.txt.gz b/ICLR2023/data/350M_PAPER_1M_iter_0.txt.gz new file mode 100644 index 0000000..f51941b Binary files /dev/null and b/ICLR2023/data/350M_PAPER_1M_iter_0.txt.gz differ diff --git 
a/ICLR2023/data/Codex_PAPER_1M_iter_0.txt.gz b/ICLR2023/data/Codex_PAPER_1M_iter_0.txt.gz new file mode 100644 index 0000000..0a1c127 Binary files /dev/null and b/ICLR2023/data/Codex_PAPER_1M_iter_0.txt.gz differ diff --git a/ICLR2023/requirements.txt b/ICLR2023/requirements.txt new file mode 100644 index 0000000..11cc109 --- /dev/null +++ b/ICLR2023/requirements.txt @@ -0,0 +1,249 @@ +adal==1.2.7 +aiohttp==3.8.5 +aiosignal==1.2.0 +amlt==8.0.9 +applicationinsights==0.11.10 +asn1crypto==0.24.0 +astor==0.8.1 +async-timeout==4.0.1 +attrs==17.4.0 +Automat==0.6.0 +azure-common==1.1.27 +azure-core==1.17.0 +azure-data-tables==12.0.0b6 +azure-graphrbac==0.61.1 +azure-identity==1.4.1 +azure-mgmt-authorization==0.61.0 +azure-mgmt-containerregistry==2.8.0 +azure-mgmt-keyvault==2.2.0 +azure-mgmt-resource==13.0.0 +azure-mgmt-storage==11.2.0 +azure-storage-blob==2.1.0 +azure-storage-common==2.1.0 +azure-storage-file==2.1.0 +azureml-automl-core==1.26.0 +azureml-contrib-k8s==0.1.16 +azureml-contrib-pipeline-steps==1.26.0 +azureml-core==1.26.0 +azureml-dataprep==2.13.2 +azureml-dataprep-native==32.0.0 +azureml-dataprep-rslex==1.11.2 +azureml-dataset-runtime==1.26.0 +azureml-k8s-mt==1.0.4 +azureml-pipeline-core==1.26.0 +azureml-pipeline-steps==1.26.0 +azureml-telemetry==1.26.0 +azureml-train-automl-client==1.26.0 +azureml-train-core==1.26.0 +azureml-train-restclients-hyperdrive==1.26.0 +backcall==0.2.0 +backports.tempfile==1.0 +backports.weakref==1.0.post1 +beautifulsoup4==4.9.3 +bitstring==3.1.9 +black==21.8b0 +blinker==1.4 +blis==0.7.4 +blobxfer==1.10.0 +cachetools==4.2.2 +catalogue==2.0.6 +certifi==2023.7.22 +cffi==1.14.6 +chardet==3.0.4 +charset-normalizer==2.0.7 +click==7.1.2 +click-completion @ git+https://github.com/temporaer/click-completion.git@41b21868cac0781d25b37da624bae2fd1f36be88 +click-option-group==0.5.3 +click-plugins==1.1.1 +cloud-init==20.2 +cloudpickle==1.6.0 +colorama==0.3.7 +colorlog==6.4.1 +command-not-found==0.3 +configobj==5.0.6 +configparser==5.0.2 
+constantly==15.1.0 +contextlib2==21.6.0 +cryptography==41.0.4 +cycler==0.10.0 +cymem==2.0.5 +datasets==1.15.1 +debugpy==1.4.3 +decorator==5.0.9 +deepspeed==0.5.1 +dill==0.3.4 +distro==1.6.0 +distro-info===0.18ubuntu0.18.04.1 +docker==5.0.1 +docker-pycreds==0.4.0 +dotnetcore2==2.1.21 +ecdsa==0.17.0 +entrypoints==0.3 +et-xmlfile==1.1.0 +fail2ban==0.10.2 +fastai==2.5.2 +fastcore==1.3.26 +fastdownload==0.0.5 +fastprogress==1.0.0 +filelock==3.0.12 +Flask==2.3.2 +Flask-Cors==3.0.10 +Flask-Executor==0.9.4 +Flask-FontAwesome==0.1.5 +frozenlist==1.2.0 +fsspec==2021.11.0 +gitdb==4.0.7 +GitPython==3.1.35 +httplib2==0.19.0 +huggingface-hub==0.1.2 +humanize==3.11.0 +hyperlink==17.3.1 +idna==2.6 +incremental==16.10.1 +ipdb==0.13.9 +ipykernel==6.4.1 +ipython==8.10.0 +ipython-genutils==0.2.0 +isodate==0.6.0 +itsdangerous==2.0.1 +jedi==0.18.0 +Jinja2==3.0.1 +jmespath==0.10.0 +joblib==1.2.0 +jsonpatch==1.16 +jsonpickle==2.0.0 +jsonpointer==1.10 +jsonschema==2.6.0 +jupyter-client==7.0.5 +jupyter-core==4.11.2 +keyring==10.6.0 +keyrings.alt==3.0 +kiwisolver==1.3.2 +language-selector==0.1 +libtmux==0.10.1 +Mako==1.2.2 +MarkupSafe==2.0.1 +marshmallow==3.10.0 +matplotlib==3.4.3 +matplotlib-inline==0.1.3 +mlb-core==0.0.4 +msal==1.14.0 +msal-extensions==0.2.2 +msrest==0.6.19 +msrestazure==0.6.4 +multidict==5.2.0 +multiprocess==0.70.12.2 +murmurhash==1.0.5 +mypy-extensions==0.4.3 +ndg-httpsclient==0.5.1 +nest-asyncio==1.5.1 +netifaces==0.10.4 +ninja==1.10.2 +ntlm-auth==1.5.0 +numpy==1.22.0 +oauthlib==3.2.2 +openai==0.13.0 +openpyxl==3.0.9 +orderedset==2.0.3 +packaging==21.0 +PAM==0.4.2 +pandas==1.3.2 +pandas-stubs==1.2.0.45 +parso==0.8.2 +passpy==1.0.2 +pathspec==0.9.0 +pathtools==0.1.2 +pathy==0.6.0 +Pebble==4.6.3 +petname==2.6 +pexpect==4.8.0 +pickleshare==0.7.5 +Pillow==10.0.1 +platformdirs==2.3.0 +portalocker==1.7.1 +preshed==3.0.5 +promise==2.3 +prompt-toolkit==3.0.20 +protobuf==3.18.3 +psb2==1.0.0 +psutil==5.8.0 +ptyprocess==0.7.0 +pyarrow==1.0.1 +pyasn1==0.4.2 +pyasn1-modules==0.2.1 
+pycparser==2.20 +pycrypto==2.6.1 +pydantic==1.8.2 +Pygments==2.15.0 +PyGObject==3.26.1 +PyJWT==2.4.0 +pyOpenSSL==17.5.0 +pyparsing==2.4.7 +pyperclip==1.8.2 +pyserial==3.4 +python-apt==1.6.5+ubuntu0.3 +python-dateutil==2.8.2 +python-debian==0.1.32 +python-gnupg==0.4.7 +pytz==2021.1 +pyxdg==0.26 +PyYAML==5.4.1 +pyzmq==22.3.0 +regex==2021.8.28 +requests==2.31.0 +requests-ntlm==1.1.0 +requests-oauthlib==1.3.0 +requests-unixsocket==0.1.5 +ruamel.yaml==0.17.16 +ruamel.yaml.clib==0.2.6 +sacremoses==0.0.45 +scikit-learn==0.24.2 +scipy==1.10.0 +SecretStorage==2.3.1 +sentry-sdk==1.14.0 +service-identity==16.0.0 +shellingham==1.4.0 +shortuuid==1.0.1 +six==1.16.0 +sklearn==0.0 +smart-open==5.2.1 +smmap==4.0.0 +soupsieve==2.2.1 +spacy==3.1.2 +spacy-legacy==3.0.8 +srsly==2.4.1 +ssh-import-id==5.7 +sshpubkeys==3.3.1 +strictfire==0.4.1 +subprocess32==3.5.4 +systemd-python==234 +tabulate==0.8.9 +tensorboardX==1.8 +termcolor==1.1.0 +thinc==8.0.10 +threadpoolctl==2.2.0 +tokenizers==0.10.3 +toml==0.10.2 +tomli==1.2.1 +torch==1.13.1 +torchvision==0.10.0 +tornado==6.3.3 +tqdm==4.62.2 +traitlets==5.1.0 +transformers==4.30.0 +Twisted==22.10.0 +typer==0.3.2 +typing-extensions==3.10.0.2 +ufw==0.36 +unattended-upgrades==0.1 +urllib3==1.26.17 +virtualenv==15.1.0 +WALinuxAgent==2.2.45 +wasabi==0.8.2 +wcwidth==0.2.5 +websocket-client==1.2.1 +Werkzeug==2.2.3 +xdg==5.1.1 +xxhash==2.0.2 +yarl==1.7.2 +zope.interface==4.3.2 diff --git a/ICLR2023/src/babysit.sh b/ICLR2023/src/babysit.sh new file mode 100755 index 0000000..cfb3359 --- /dev/null +++ b/ICLR2023/src/babysit.sh @@ -0,0 +1,161 @@ +#!/bin/bash +# All Experiment Settings - constant through the experiment run - passed to gen.sh and fine_tune.sh as needed +GPU=0 # which GPU to use +MODEL="125M" # MODEL is the size of the model: 125M, 13B, 27B +EXPERIMENT=$MODEL"_PAPER" # Name of Experiment directory under data/* and models/base-model/* to store results +TEST_LOCAL=1 # 0 means run gen/fine_tune on cluster remotely, 1 means run gen/fine_tune 
locally +TARGET_NUM_FILES=1 # How many files to generate in each iteration before starting fine_tuning. Count of unique examples would have been better. +ITER_START=0 # inclusive index to start processing at - creates iter_# under data&models at each iteration. Can continue prev runs by start at prev ITER_MAX +ITER_MAX=5 # exclusive index to stop processing iterations at +EPOCHS_START=1 # inclusive index of epochs to start processing at - could continue prev run by starting at prev EPOCHS_MAX+1 - 0th epoch is the default model so epoch starts at 1 +EPOCHS_MAX=4 # inclusive index of epochs to stop processing at +EPOCHS_PER_STEP=1 # How many EPOCHS through the data to do in each step +TRAIN_INCREMENTAL=0 # Only train on data from the latest iteration, and start finetuning on the last finetuned model - otherwise start from scratch and use all the data generated +TRAIN_BOOST=0 # Initial generation of data from default model is slow - 1 means looks in 125M_RL_ALL to use previous generated initial data to bootstrap. 
+PASS_AT_K=100 # PASS_AT_K says do K trials to solve to compute Pass@K +LINE_LOG_K=11 # LINE_LOG_K is how many lines of results from solve have results for saving + +echo babysit args: $# $0 $1 $2 $3 $4 + +if (( $# \!= 1 )) +then + echo babysit.sh only takes 1 argument, unless called by another script to initialize configuration variables + return +fi + +if (( $# \>= 1 )) +then + GPU=$1 +fi + +echo babysit GPU $GPU + +for (( iteration=$ITER_START; iteration<$ITER_MAX; iteration++ )) +do + FULLNAME="${EXPERIMENT}---${iteration}" + echo FULLNAME $FULLNAME + export FULLNAME # Needed to pass variable off to yaml job + DATAPATH=data/${EXPERIMENT}/iter_$iteration + echo DATAPATH $DATAPATH + + if (( $TEST_LOCAL \> 0 )) + then + count=`ls -lt ../${DATAPATH} | grep json | wc -l` + else + count=`amlt sto list ${DATAPATH} | grep json | wc -l` + fi + echo count $count + + # Instead of file count we might want to check if the amount of data from preprocess is sufficient + # If not we call to generate more + + if (( $count \> 0 )) + then + echo "$FULLNAME has already been started" + echo "You are resuming at iteration $iteration" + echo "You already have $count files of data this iteration" + else + echo "$FULLNAME is starting generation for iteration $iteration" + fi + + if (( $count \< $TARGET_NUM_FILES )) + then + if (( $TEST_LOCAL \> 0 )) + then + # ./gen.sh $GPU 2560 100 $FULLNAME -1 + # 2.7B 384 100 runs ~10 hours + # 2.7B 160 100 runs ~4.5 hours + ./gen.sh $GPU 256000 100 $FULLNAME -1 + else + amlt run hyper_gen_octows.yaml $FULLNAME -d "$FULLNAME" + exit + fi + fi + + # Running local you are done, but launching on the cloud, you have to wait + for (( poll=0; poll<500; poll++ )) + do + if (( $TEST_LOCAL \> 0 )) + then + count=`ls -lt ../${DATAPATH} | grep json | wc -l` + else + count=`amlt sto list ${DATAPATH} | grep json | wc -l` + fi + + echo "gen wait - Iteration: $iteration, Poll: $poll, Count: $count" + + if (( $count \>= $TARGET_NUM_FILES )) + then + echo "Finished 
generation iteration $iteration after $poll polls" + break + fi + sleep 3m + done + + # Start a finetune job + if (( $TEST_LOCAL \> 0 )) + then + ./fine_tune.sh $GPU $FULLNAME + else + # Pass enviroment variable FULLNAME to amlt.yaml + amlt run amlt_octo.yaml $FULLNAME -d "$FULLNAME" + exit + fi + + # On cluster we need to wait for finetune job to finish, run locally it's done + # Check the log files for starting the running of solve have been created for the last epoch of training + + MODELPATH=models/gpt-neo-$MODEL/${EXPERIMENT}/iter_$iteration + SOLVE_PATH=$MODELPATH/"epoch_"$EPOCHS_MAX/"solve_"$PASS_AT_K + echo babysit.sh SOLVE_PATH $SOLVE_PATH + + for (( poll=0; poll<500; poll++ )) + do + if (( $TEST_LOCAL \> 0 )) + then + count=`ls -lt ../$SOLVE_PATH | grep json | wc -l` + else + count=`amlt sto list $SOLVE_PATH | grep json | wc -l` + fi + + echo "fine_tune wait - Iteration: $iteration, Poll: $poll, Count: $count" + + if (( $count \>= 1 )) + then + echo "Finished fine_tune iteration $iteration after $poll polls" + break + fi + sleep 3m + done + +done + +# Pull all the results into 1 log file to look at more easily + +if [[ -z "${AMLT_DATA_DIR}" ]]; +then + # running locally on torch2020 so we don't have AMLT enviroment variables defined, so need to set them up + AMLT_DATA_DIR=../data +else + # On remote we don't have access to the log files - maybe could do amlt sto download to do this summary below ? 
+ exit +fi + +BASE_MODEL_PATH=$AMLT_DATA_DIR/../models/gpt-neo-$MODEL +LOG_FILE=$BASE_MODEL_PATH/$EXPERIMENT/"solve_"$PASS_AT_K".txt" +echo solve LOG_FILE for babysit.sh is $LOG_FILE +rm $LOG_FILE + +for (( iteration=$ITER_START; iteration<$ITER_MAX; iteration++ )) +do + for (( epochs=$EPOCHS_START; epochs<=$EPOCHS_MAX; epochs++ )) + do + EPOCH_NAME="epoch_"$epochs + STEP_PATH=$BASE_MODEL_PATH/$EXPERIMENT/iter_$iteration/$EPOCH_NAME + MODEL_PATH=$STEP_PATH/finetuned + echo iteration $iteration epoch $epochs >> $LOG_FILE + head -$LINE_LOG_K $STEP_PATH/"solve_"$PASS_AT_K/results.json >> $LOG_FILE + done +done + +cat $LOG_FILE diff --git a/ICLR2023/src/ds_config_gptneo.json b/ICLR2023/src/ds_config_gptneo.json new file mode 100644 index 0000000..91b4864 --- /dev/null +++ b/ICLR2023/src/ds_config_gptneo.json @@ -0,0 +1,43 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + }, + "scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + }, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 2e8, + "overlap_comm": true, + "reduce_scatter": true, + "reduce_bucket_size": 2e8, + "contiguous_gradients": true, + "cpu_offload": true + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 2000, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": false +} diff --git a/ICLR2023/src/fine_tune.py b/ICLR2023/src/fine_tune.py new file mode 100644 index 0000000..960fb55 --- /dev/null +++ b/ICLR2023/src/fine_tune.py @@ -0,0 +1,128 @@ +from strictfire import StrictFire as Fire # aborts early on invalid arguments +import os +import csv 
def _launcher_device_args(gpu, num_gpus):
    """Translate the ``gpu`` / ``num_gpus`` options into deepspeed launcher settings.

    gpu: a single GPU index, a list/tuple of GPU indices, or a negative int
        meaning "use abs(gpu) GPUs" (the cluster hack kept from the original).
    num_gpus: number of GPUs to use when no explicit index list is wanted.

    Returns a ``(master_port, args)`` pair, where ``args`` is the list of
    launcher flags selecting the devices.

    Raises ValueError if ``gpu`` is an empty list/tuple.
    """
    # During training deepspeed uses a port to synchronize. Offsetting the port
    # by the GPU index lets 2 jobs on different GPUs run in parallel.
    base_port = 29600

    if isinstance(gpu, (list, tuple)):
        if not gpu:
            raise ValueError("gpu must name at least one device")
        master_port = base_port + gpu[0]
        include = ",".join(str(g) for g in gpu)
        # An explicit device list is never the "negative count" hack; the
        # original compared the joined string with 0 here and crashed.
        gpu_is_count = False
    else:
        master_port = base_port + gpu
        include = gpu
        # If gpu is passed in as negative - it's the count of gpus to use - a bit of a hack
        gpu_is_count = gpu < 0

    if gpu_is_count:
        return master_port, ["--num_nodes=1", f"--num_gpus={abs(gpu)}"]
    if num_gpus > 1:
        return master_port, ["--num_nodes=1", f"--num_gpus={num_gpus}"]
    return master_port, [f"--include=localhost:{include}"]


def fine_tune(
    train_txt="../data/generated_sol_100.txt",
    output_dir="../outputs/",
    subdir="out",
    model_path="EleutherAI/gpt-neo-2.7B",
    gpu=0,
    num_gpus=1,
    epochs=4,
    seed=0,
):
    """
    Fine tune the model on the puzzles in train_txt file and save the results to output_dir/subdir

    The text is written to output_dir/subdir/train.csv and neo_train.py is then launched
    through the deepspeed launcher; the model output directory is output_dir/subdir/finetuned.

    train_txt: the (possibly gzipped) file containing the text to fine tune on (default: ../data/generated_sol_100.txt)
    output_dir: the directory under which subdir is created (default "../outputs/")
    subdir: the subdirectory to save the results to (default "out")
    model_path: the path to the model to fine tune (default "EleutherAI/gpt-neo-2.7B")
    gpu: which GPU(s) to use, e.g.: 0,1 (default 0); a negative value is the number of GPUs to use
    num_gpus: how many GPUs to use; values above 1 replace the explicit GPU selection (default 1)
    epochs: how many epochs to train for (default 4)
    seed: the random seed to use, not sure if this affects fine tuning (default 0)

    Raises subprocess.CalledProcessError if the training command exits with a non-zero status.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # create output dir if necessary
    output_path = os.path.join(output_dir, subdir)
    os.makedirs(output_path, exist_ok=True)

    text = utils.load_text_file(train_txt)  # decompresses if ends in .gz
    tokenizer = utils.load_tokenizer(model_path)
    num_toks = utils.num_tokens(text, tokenizer, verbose=True)
    assert num_toks > 1024, "Not enough tokens in text to fine tune"

    # create csv: a single "text" column holding the whole training text in one row
    train_file = os.path.join(output_path, "train.csv")
    with open(train_file, mode="w", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["text"])
        writer.writeheader()
        writer.writerow({"text": text})

    output_path_finetuned = os.path.join(output_path, "finetuned")

    # keep gradient_accumulation_steps at 1 bc setting it to 2 effectively doubles the batch
    # size which gets tricky when batch sizes are small (ft_tokens will no longer be accurate)
    gradient_accumulation_steps = 1
    per_device_train_batch_size = 4

    # Drop any inherited CUDA_VISIBLE_DEVICES before launching.
    # NOTE(review): presumably so the launcher's own device selection below is
    # not restricted by the caller's environment - confirm.
    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "")
    if cuda_visible_devices:
        print("os.environ(CUDA_VISIBLE_DEVICES)", cuda_visible_devices)
        del os.environ["CUDA_VISIBLE_DEVICES"]
    print("os.environ(CUDA_VISIBLE_DEVICES)", os.environ.get("CUDA_VISIBLE_DEVICES", ""))

    master_port, gpu_args = _launcher_device_args(gpu, num_gpus)
    print("gpu_string", " ".join(gpu_args))

    # Build the command as an argument list so that paths containing spaces
    # survive intact (the original joined everything and re-split the string).
    argv = [
        "deepspeed",
        f"--master_port={master_port}",
        *gpu_args,
        "neo_train.py",
        f"--model_name_or_path={model_path}",
        f"--train_file={train_file}",
        f"--output_dir={output_path_finetuned}",
        "--overwrite_output_dir",
        "--ignore_data_skip",
        "--deepspeed",
        "ds_config_gptneo.json",
        "--save_strategy=no",  # ATK remove checkpointing for large datasets
        # pretty sure this is just dataset cache
        "--overwrite_cache",
        # logging frequency
        "--logging_steps=5",
        "--do_train",
        # turns off report_to WANDB for instance
        "--report_to",
        "none",
        "--fp16",
        f"--num_train_epochs={epochs}",
        f"--per_device_train_batch_size={per_device_train_batch_size}",
        "--use_fast_tokenizer=False",
        f"--gradient_accumulation_steps={gradient_accumulation_steps}",
        "--learning_rate=5e-06",
        # linear increase from this up to learning rate, then LR schedule happens (which itself is linear decreasing until max_steps)
        "--warmup_steps=10",
    ]

    # Shell-quoted rendering, used only for logging.
    cmd = " ".join(shlex.quote(arg) for arg in argv)

    utils.info(f"running command: {cmd}")
    print(f"Command to run:{cmd}")  # Why is this different than what utils.info prints out, utils.info truncates it
    res = subprocess.run(argv, check=True)
    utils.info(str(res))
+# if GPU is negative - that tells fine_tune how many GPU to use on cluster - and we need to set GPU for solve to 0 on cluster +# if GPU is positive - we are running locally on torch2020 - and we need to leave the GPU set properly +GPU_SOLVE=$GPU +if (( $GPU \< 0 )) +then + GPU_SOLVE=0 +fi +echo GPU_SOLVE $GPU_SOLVE + +python preprocess.py $OUTPATH + +TRN_FILE=$OUTPATH/gen_ps_filtered.txt +echo TRN_FILE $TRN_FILE +TST_FILE=$AMLT_DATA_DIR/test_228.json + +BASE_MODEL_PATH=$AMLT_DATA_DIR/../models/gpt-neo-$MODEL +# 125M is copied locally to start +MODEL_PATH=$BASE_MODEL_PATH +MODEL_PATH=EleutherAI/gpt-neo-125M # !!! Just for paper release +# 13B is off in the cloud to start +if [[ "$MODEL" == "13B" ]]; then + MODEL_PATH=EleutherAI/gpt-neo-1.3B +fi +# 27B is off in the cloud tto start +if [[ "$MODEL" == "27B" ]]; then + MODEL_PATH=EleutherAI/gpt-neo-2.7B +fi + +echo MODEL MODEL_PATH $MODEL $MODEL_PATH + +# Training incremental means use the previous iterations trained model, and just the additional iteration's new data to fine_tune on. +# Otherwise use the base model - and retrain from scratch on all the data from all previous iterations. +# They are sort of equivalent - except from scratch picks up any extra data that was generated - and mixes all the iterations data together - but slower. 
+if (( $TRAIN_INCREMENTAL \> 0 )) +then + PREV_ITERATION=$((ITERATION-1)) + echo $PREV_ITERATION + TEST=$AMLT_DATA_DIR/../models/gpt-neo-$MODEL/$EXPERIMENT/iter_$PREV_ITERATION/epoch_$EPOCHS_MAX/finetuned + + if [ -a $TEST ] # exists + then + MODEL_PATH=$TEST + echo fine_tune.sh using previous iteration model + fi +fi + +echo "fine_tune.sh starting from NEO model at: ${MODEL_PATH}" + +# Pull all the results into 1 log file to look at more easily +LOG_FILE=$BASE_MODEL_PATH/$EXPERIMENT/iter_$ITERATION/"solve.txt" +echo solve LOG_FILE for fine_tune.sh is $LOG_FILE +rm $LOG_FILE + +for (( epochs=$EPOCHS_START; epochs<=$EPOCHS_MAX; epochs++ )) +do + EPOCH_NAME="epoch_"$epochs + EPOCHS_STEP=$(($EPOCHS_PER_STEP * $epochs)) + python fine_tune.py -train_txt=$TRN_FILE -gpu=$GPU -output_dir=$BASE_MODEL_PATH/$EXPERIMENT/iter_$ITERATION -subdir=$EPOCH_NAME -model_path=$MODEL_PATH -epochs=$EPOCHS_STEP + # measure the finetuned model's accuracy + STEP_PATH=$BASE_MODEL_PATH/$EXPERIMENT/iter_$ITERATION/$EPOCH_NAME + MODEL_PATH=$STEP_PATH/finetuned + python solve.py -prefix=$AMLT_DATA_DIR/train_prefix.txt -attempts=$PASS_AT_K -model_path=$MODEL_PATH -gpu=$GPU_SOLVE -fixed_temp=0.8 -out=$STEP_PATH/"solve_"$PASS_AT_K"/" -puzzles=$TST_FILE + head -$LINE_LOG_K $STEP_PATH/"solve_"$PASS_AT_K/results.json >> $LOG_FILE +done + +cat $LOG_FILE \ No newline at end of file diff --git a/ICLR2023/src/fine_tune1.sh b/ICLR2023/src/fine_tune1.sh new file mode 100755 index 0000000..7f9cb46 --- /dev/null +++ b/ICLR2023/src/fine_tune1.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# This is for finetuning a model on 1 dataset only +echo fine_tune1.sh args: $# $0 $1 $2 $3 $4 + +# All Experiment Settings - constant through the experiment run +GPU=0 # which GPU to use +MODEL="125M" # MODEL is the size of the model: 125M, 13B, 27B +EXPERIMENT=$MODEL"_PAPER1" # Name of Experiment directory under data/* and models/base-model/* to store results +ITERATION=0 # Random seed for finetuning +EPOCHS_START=1 # inclusive index of 
epochs to start processing at - could continue prev run by starting at prev EPOCHS_MAX+1 - 0th epoch is the default model so epoch starts at 1 +EPOCHS_MAX=10 # inclusive index of epochs to stop processing at +EPOCHS_PER_STEP=1 # How many EPOCHS through the data to do in each step +PASS_AT_K=100 # PASS_AT_K says do K trials to solve to compute Pass@K +LINE_LOG_K=11 # LINE_LOG_K is how many lines of results from solve have results for saving + +# On AMLT machines we don't specify which GPU to use +if [[ -z "${AMLT_DATA_DIR}" ]]; then + # running locally on torch2020 so we don't have AMLT enviroment variables defined, so need to set them up + AMLT_DATA_DIR=../data +fi + +if (( $# \>= 1 )) +then + GPU=$1 +fi + +echo GPU $GPU +echo EXPERIMENT $EXPERIMENT +echo ITERAION $ITERATION + +TRN_FILE=$AMLT_DATA_DIR/generated_sol_950k.txt +echo TRN_FILE $TRN_FILE +TST_FILE=$AMLT_DATA_DIR/test_228.json + +# GPU_SOLVE is the GPU we want solve to use. Solve currently only uses 1 GPU - it would be great to make it use more when they are available. +# if GPU is negative - that tells fine_tune how many GPU to use on cluster - and we need to set GPU for solve to 0 on cluster +# if GPU is positive - we are running locally on torch2020 - and we need to leave the GPU set properly +GPU_SOLVE=$GPU +if (( $GPU \< 0 )) +then + GPU_SOLVE=0 +fi +echo GPU_SOLVE $GPU_SOLVE + +# measure the base model's accuracy - don't really need to do this very often - it doesn't change + +BASE_MODEL_PATH=$AMLT_DATA_DIR/../models/gpt-neo-$MODEL +# 125M is copied locally to start +MODEL_PATH=$BASE_MODEL_PATH +MODEL_PATH=EleutherAI/gpt-neo-125M # !!! 
Just for paper release +# 13B is off in the cloud to start +if [[ "$MODEL" == "13B" ]]; then + MODEL_PATH=EleutherAI/gpt-neo-1.3B +fi +# 27B is off in the cloud tto start +if [[ "$MODEL" == "27B" ]]; then + MODEL_PATH=EleutherAI/gpt-neo-2.7B +fi + +echo MODEL MODEL_PATH $MODEL $MODEL_PATH + +# Training incremental means use the previous epochs model to start +# Otherwise use the base model to retrain from scratch +if (( $EPOCHS_START \> 1 )) +then + PREV_EPOCH=$((EPOCHS_START-1)) + echo $PREV_EPOCH + TEST=$AMLT_DATA_DIR/../models/gpt-neo-$MODEL/$EXPERIMENT/iter_$ITERATION/epoch_$PREV_EPOCH/finetuned + + if [ -a $TEST ] # exists + then + MODEL_PATH=$TEST + echo fine_tune.sh using previous iteration model + fi +fi + +echo "fine_tune.sh starting from NEO model at: ${MODEL_PATH}" + +# Pull all the results into 1 log file to look at more easily +LOG_FILE=$BASE_MODEL_PATH/$EXPERIMENT/iter_$ITERATION/"solve.txt" +echo solve LOG_FILE for fine_tune.sh is $LOG_FILE +rm $LOG_FILE + +for (( epochs=$EPOCHS_START; epochs<=$EPOCHS_MAX; epochs++ )) +do + EPOCH_NAME="epoch_"$epochs + EPOCHS_STEP=$(($EPOCHS_PER_STEP * $epochs)) + python fine_tune.py -train_txt=$TRN_FILE -gpu=$GPU -output_dir=$BASE_MODEL_PATH/$EXPERIMENT/iter_$ITERATION -subdir=$EPOCH_NAME -model_path=$MODEL_PATH -epochs=$EPOCHS_STEP -seed=$ITERATION + # measure the finetuned model's accuracy + STEP_PATH=$BASE_MODEL_PATH/$EXPERIMENT/iter_$ITERATION/$EPOCH_NAME + MODEL_PATH=$STEP_PATH/finetuned + python solve.py -prefix=$AMLT_DATA_DIR/train_prefix.txt -attempts=$PASS_AT_K -model_path=$MODEL_PATH -gpu=$GPU_SOLVE -fixed_temp=0.8 -out=$STEP_PATH/"solve_"$PASS_AT_K"/" -puzzles=$TST_FILE -seed=$ITERATION -batch_size=256 + head -$LINE_LOG_K $STEP_PATH/"solve_"$PASS_AT_K/results.json >> $LOG_FILE +done + +cat $LOG_FILE \ No newline at end of file diff --git a/ICLR2023/src/gen.py b/ICLR2023/src/gen.py new file mode 100644 index 0000000..57f3216 --- /dev/null +++ b/ICLR2023/src/gen.py @@ -0,0 +1,522 @@ +from typing import List 
def good_puzzles(puzzles: List[str], trivial_reject_rate, verbose=True):
    """Return the generated puzzles that are well-formed and not trivially solvable.

    The candidates go through four filtering steps:

    1. strip trailing code; keep sources that start with ``def f(`` and contain ``return``;
    2. insert ``return True`` as the first body line and let the judge run ``f(None)``;
       this drops puzzles whose signature does not execute or that the judge rejects;
    3. require exactly one parameter without a default, annotated with ``int``, ``str``,
       ``float`` or a (nested) ``List`` of a base type, and mentioned in the body;
    4. drop puzzles that some simple test value of that type already satisfies.

    Args:
        puzzles: source strings of candidate puzzles.
        trivial_reject_rate: probability that a puzzle is checked for trivial
            solutions in step 4. Set it to 1 to check (and reject) every trivial puzzle.
        verbose: when true, log every puzzle that survives step 4.

    Returns:
        List of the puzzle sources that passed all steps (possibly empty).
    """
    from itertools import product  # function-scope import, as in the original code

    def report(step, kept, total):
        # Guard the ratio: an empty input must not raise ZeroDivisionError.
        share = kept / total if total else 0.0
        utils.info(f"{kept:,}/{total:,} = {share:.0%} puzzle passed step {step}")

    # Step 1: make sure they have a return statement and start with 'def f(';
    # also strip any trailing code.
    n = len(puzzles)
    puzzles = [strip_puzzle(p) for p in puzzles]
    puzzles = [p for p in puzzles if p.startswith("def f(") and "return" in p]
    report(1, len(puzzles), n)

    # Step 2: modify the puzzle by inserting a `return True` as its first line and
    # judge whether f(None) is True. This removes puzzles with bad signatures or
    # dangerous code as detected by the judge.
    def make_true(p):
        lines = p.split("\n")
        lines.insert(1, "    return True")
        lines.append("")
        lines.append("assert f(None)")
        return "\n".join(lines)

    n = len(puzzles)
    verdicts = judge.judge_parallel([make_true(p) for p in puzzles], timeout=1)
    puzzles = [p for p, ok in zip(puzzles, verdicts) if ok]
    report(2, len(puzzles), n)

    def get_trivial_tests(p):
        """Pick the trivial test values for puzzle `p` from its answer type.

        Returns None if the spec is invalid, [] if the puzzle was randomly
        exempted from trivial testing, otherwise the list of values to try.
        """
        try:
            env = {"List": List}
            # NOTE(review): this executes generated code in-process; it only runs on
            # puzzles that already passed the judge in step 2.
            exec(p, env)
            f = env["f"]
            spec = inspect.getfullargspec(f)
            ans_var_name = spec.args[0]
            typ = spec.annotations[ans_var_name]
        except Exception:
            return None
        num_var_mentions = len(re.findall(r"\b" + ans_var_name + r"\b", p))
        if (
            len(spec.defaults or []) != len(spec.args) - 1  # need exactly one required parameter
            or spec.varargs  # weird spec: *args, **kwargs, keyword-only parameters
            or spec.varkw
            or spec.kwonlyargs
            or spec.kwonlydefaults
        ):
            return None
        if random.random() > trivial_reject_rate:
            return []  # don't bother to test some small fraction of the puzzles for trivial solution
        if (
            num_var_mentions <= 1  # need to use the answer variable other than in the spec
            or typ is bool  # bool puzzles are all trivial
        ):
            return None
        base_types = {"bool": bool, "float": float, "str": str, "int": int}
        if typ not in base_types.values():
            if str(typ) == "str" and typ is not str:
                return None  # annotation given as the string "str" rather than the type
            type_str = str(typ).replace("typing.", "")
            inside = type_str.replace("List[", "").replace("]", "")
            if inside not in base_types:
                return None
        if typ is int:
            return list(range(-10, 101))
        if typ is str:
            return ["cat", "dog", "aa", "ab", "foo", "bar", "baz", ""]
        if typ is float:
            return [-100.0, -10.0, -2.0, -1.0, -0.5, -0.1, 0.0, 0.1, 0.5, 1.0, 2.0, 10.0, 100.0]
        # (typ is bool was rejected above, so only the List[...] case remains)
        depth = type_str.count("List[")
        if depth == 0:
            return None
        if inside == "int":
            base = list(range(-3, 4))
        elif inside == "str":
            base = ["a", "b", "foo", "bar", "baz"]
        elif inside == "bool":
            base = [True, False]
        elif inside == "float":
            base = [-1.0, -0.1, 0.0, 0.1, 0.5, 1.0, 2.0]
        else:
            return None

        # all lists of length 0, 1 and 2 over the base values
        tests = []
        for r in range(3):
            tests.extend(list(combo) for combo in product(base, repeat=r))
        # wrap once per extra nesting level, keeping the empty list as a candidate
        for _ in range(depth - 1):
            tests = [[i] for i in tests]
            if [] not in tests:
                tests.append([])
        return tests

    # Step 3: keep only puzzles with a usable spec, remembering their test values.
    n = len(puzzles)
    candidates = [(p, get_trivial_tests(p)) for p in puzzles]
    # Built without zip(*...) so that "nothing survived" yields empty lists
    # instead of a ValueError on unpacking.
    kept = [(p, t) for p, t in candidates if t is not None]
    report(3, len(kept), n)

    # Step 4: remove puzzles with trivial solutions.
    # todo: also remove puzzles that raise a NameError exception??
    n = len(kept)
    nontrivials = []

    for p, tests in tqdm(kept):
        results = judge.judge_parallel(
            [f"{p}\n\ntry:\n    assert f({utils.stringify(t)})\nexcept NameError:\n    pass" for t in tests], timeout=1
        )
        if not any(results):
            nontrivials.append(p)
            if verbose:
                utils.info("*" * 100)
                utils.info(p)

    report(4, len(nontrivials), n)

    return nontrivials
def get_inputs(sat: str):
    """Extract the parameters after the first one from a puzzle's `def` line.

    For example::

        def f(a: Dict[int, str], b=12):
            test

    gives ``'b=12'``.

    Only the first line of `sat` is inspected. A ``" -> bool"`` return
    annotation and a trailing ``#`` comment are ignored. Commas nested inside
    ``[]``, ``()``, ``{}`` or inside a string literal do not count as
    parameter separators.

    Args:
        sat: source code of the puzzle function.

    Returns:
        The text of the parameter list after the first top-level comma (stripped),
        ``""`` if there is only one parameter, or ``None`` if the first line is not
        of the form ``def ...(...):``.
    """
    sat = sat.replace(" -> bool", "")
    first_line = sat.split("\n")[0].strip()
    if not first_line.endswith("):") and "#" in first_line:
        first_line = first_line[: first_line.index("#")].strip()
    if not (first_line.endswith("):") and first_line.startswith("def")):
        # raise WeirdInputsException("Weird puzzle, cannot extract inputs", json.dumps(sat))
        return None

    # text strictly between the opening "(" and the closing "):"
    arg_str = first_line[first_line.index("(") + 1 : -len("):")]

    depth = 0  # nesting level of brackets/parentheses/braces
    quote = None  # the quote character while inside a string literal
    escaped = False  # previous character was a backslash inside a string
    for i, c in enumerate(arg_str):
        if quote is not None:
            if escaped:
                escaped = False
            elif c == "\\":
                escaped = True
            elif c == quote:
                quote = None
        elif c in "\"'":
            quote = c
        elif c in "([{":
            depth += 1
        elif c in ")]}":
            depth -= 1
        elif c == "," and depth == 0:
            return arg_str[i + 1 :].strip()
    return ""
def gen(
    out="../outputs/gen//",
    n=100_000,
    seed=0,
    trivial_reject_rate=0.95,
    temp=0.9,
    temp_solve=None,
    gpu=0,
    train="../data/155_train.json",
    prefix="../data/train_prefix.txt",
    remove_docstring=True,
    model_path="EleutherAI/gpt-neo-125M",
    model_path_solve=None,
    max_tokens=2048,
    per_prompt=64,
    attempts=128,
    only_good=False
):
    """
    Generate puzzles with a language model, solve them with the same model, and save the results.

    out: directory in which the experiment output path is created
    n: number of puzzles to generate, actual number of puzzles will be smaller after filtering
    seed: random seed; -1 leaves the random number generators unseeded
    trivial_reject_rate: what fraction of trivial puzzles to reject
    temp: temperature for generation
    temp_solve: temperature for solving (if different than temp, default=None means same as generation temp)
    gpu: the gpu to use (default 0)
    train: path to training data (default: 155_train.json)
    prefix: text filename containing prompt (default: ../data/train_prefix.txt)
    remove_docstring: whether to remove docstrings from puzzles (default=True)
    model_path: path to model for generating puzzles
    model_path_solve: path to model for solving puzzles; currently NOT used, the
        generation model also does the solving
    max_tokens: maximum number of tokens that can fit in a prompt
    per_prompt: number of puzzles to generate per prompt
    attempts: number of solutions to generate per puzzle
    only_good: if True, skip the full puzzles.json / puzzles.txt dump and write only
        the puzzles that have at least one verified solution
    """
    params = locals().copy()  # store parameters (must stay first so only arguments are captured)
    utils.info("PARAMETERS:")
    utils.info(params)

    if model_path_solve is not None and model_path_solve != model_path:
        # Make the silent fallback visible instead of quietly ignoring the argument.
        utils.info(f"WARNING: model_path_solve={model_path_solve} is ignored, solving with {model_path}")

    if seed != -1:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

    torch.cuda.set_device(int(gpu))
    tokenizer = utils.load_tokenizer(model_path)
    model = solve.load_model(model_path, pad_token_id=tokenizer.eos_token_id)  # pad_token_id removes warnings

    output_path = utils.create_experiment_outpath(out, False)

    prefix = utils.load_text_file(prefix).strip() if prefix else ""
    if prefix:
        prefix += "\n\n"

    time0 = time.time()

    puzzles = gen_from_puzzles(
        filename=train,
        n=n,
        per_prompt=per_prompt,
        temp=temp,
        model=model,
        tokenizer=tokenizer,
        trivial_reject_rate=trivial_reject_rate,
        max_tokens=max_tokens,
        remove_docstring=remove_docstring,
    )
    num_puzzles = len(puzzles)
    utils.info(f"Generated {num_puzzles:,} puzzles.")

    # `temp_solve or temp` would also discard an explicit temp_solve of 0;
    # only None means "same as the generation temperature".
    solve_temp = temp if temp_solve is None else temp_solve

    puzzles_and_solutions = solve_puzzles(
        puzzles,
        prefix=prefix,
        attempts=attempts,
        model=model,
        tokenizer=tokenizer,
        temp=solve_temp,
    )

    # fewest solutions first, ties broken by puzzle length
    puzzles_and_solutions.sort(key=lambda z: (len(z[1]), len(z[0])))

    if not only_good:
        out_filename = os.path.join(output_path, "puzzles.json")

        utils.save_json(puzzles_and_solutions, out_filename)

        # human-readable companion file: puzzle followed by up to two solutions/failures
        out_filename2 = out_filename.replace(".json", ".txt").replace(".gz", "")
        with open(out_filename2, "w", encoding="utf8") as file:
            for f, gs, *rest in puzzles_and_solutions:
                if rest:
                    [rest] = rest
                    print(len(gs), len(rest), "/" * 100, file=file)
                else:
                    print(len(gs), "=" * 100, file=file)
                print(f, file=file)
                print(file=file)
                for g in (gs + rest)[:2]:
                    print(g, file=file)
                    print(file=file)

        utils.info("Wrote results to {}.".format(out_filename))
        utils.info("Wrote results to {}.".format(out_filename2))

    # generating puzzles and solutions on cluster we want to limit the amount of data produced
    # save out only the puzzles with their good solutions, forget bad solutions and unsolved problems.
    gp_gs = [(f, gs, []) for f, gs, *_rest in puzzles_and_solutions if len(gs) > 0]

    out_filename = os.path.join(output_path, "good_puzzles_" + "R0_" + str(gpu) + "_" + time.strftime("%y-%m-%d-%H-%M-%S") + ".json")
    utils.save_json(gp_gs, out_filename)

    time1 = time.time()
    utils.info(f"Took {time1 - time0:.3f} seconds.")
    utils.info(f"Saved as file {out_filename}")
babysit.sh + +if [[ -z "${AMLT_DATA_DIR}" ]]; then + # running locally on torch2020 we don't have AMLT enviroment variables defined, set them up + AMLT_DATA_DIR="../data" +fi + +echo RANK is: +echo $RANK + +if [[ -z "${RANK}" ]]; then + # running locally on torch2020 we don't have AMLT enviroment variables defined, set them up + RANK=0 +fi + +GPU=$RANK +PUZZLE_CNT=32 +SOLUTION_CNT=32 +FULLNAME="125M_RL_TEST---0" + +echo $# $0 $1 $2 $3 $4 +if (( $# \>= 1 )) +then + GPU=$1 +fi + +echo $RANK +echo $GPU + +if (( $# \>= 2 )) +then + PUZZLE_CNT=$2 +fi + +if (( $# \>= 3 )) +then + SOLUTION_CNT=$3 +fi + +if (( $# \>= 4 )) +then + FULLNAME=$4 + +fi + +RANDOM_SEED=-1 + +if (( $# \>= 5 )) +then + RANDOM_SEED=$5 + echo "Random seed is $RANDOM_SEED" +fi + +SPLIT=(${FULLNAME//---/ }) +EXPERIMENT=${SPLIT[0]} +ITERATION=${SPLIT[1]} +OUTPATH=$AMLT_DATA_DIR/$EXPERIMENT/iter_$ITERATION + +echo GPU $GPU +echo EXPERIMENT $EXPERIMENT +echo ITERAION $ITERATION +echo OUTPATH $OUTPATH + +BASE_MODEL_PATH=$AMLT_DATA_DIR/../models/gpt-neo-$MODEL +# 125M is copied locally to start +MODEL_PATH=$BASE_MODEL_PATH +MODEL_PATH=EleutherAI/gpt-neo-125M # !!! 
Just for paper release +# 13B is off in the cloud to start +if [[ "$MODEL" == "13B" ]]; then + MODEL_PATH=EleutherAI/gpt-neo-1.3B +fi +# 27B is off in the cloud tto start +if [[ "$MODEL" == "27B" ]]; then + MODEL_PATH=EleutherAI/gpt-neo-2.7B +fi + +echo MODEL MODEL_PATH $MODEL $MODEL_PATH + +PREV_ITERATION=$((ITERATION-1)) +echo $PREV_ITERATION +TEST=$AMLT_DATA_DIR/../models/gpt-neo-$MODEL/$EXPERIMENT/iter_$PREV_ITERATION/epoch_$EPOCHS_MAX/finetuned + +if [ -a $TEST ] # exists +then + MODEL_PATH=$TEST + echo fine_tune.sh using previous iteration model +fi + +echo "gen.sh using NEO model at: ${MODEL_PATH}" + +python gen.py -out="$OUTPATH" -n=$PUZZLE_CNT -seed=$RANDOM_SEED -gpu=$GPU -train=$AMLT_DATA_DIR/155_train.json -prefix=$AMLT_DATA_DIR/train_prefix.txt -model_path=$MODEL_PATH -attempts=$SOLUTION_CNT -only_good=True diff --git a/ICLR2023/src/judge.py b/ICLR2023/src/judge.py new file mode 100644 index 0000000..2ff4c3a --- /dev/null +++ b/ICLR2023/src/judge.py @@ -0,0 +1,176 @@ +from utils import load_json +from pebble import ProcessPool +import multiprocessing as mp +from concurrent.futures import TimeoutError +from typing import List, Set, Tuple, Dict + +import utils +import sys +import re +from copy import deepcopy + +sys.setrecursionlimit(5000) + + +def no_print(*_args, **_kwargs): + pass + + +def run_judge(judge, f, tests): + answer_type = list(judge.__annotations__.values())[0] + for x in tests: + y = f(**deepcopy(x)) # so f cannot cheat and change the input x + if not utils.type_check(y, answer_type): + raise TypeError + assert judge(y, **x) is True, f"{f} failed on test {x}" + + +_ENV = dict( + List=List, + Set=Set, + Tuple=Tuple, + Dict=Dict, + type_check=utils.type_check, + run_judge=run_judge, + test_puzzle=utils.test_puzzle, + os=None, + sys=None, + input=None, + open=None, + print=no_print, + compile=None, + copyright=None, +) + +_UNSAFE = ["builtin", "__class", "open("] +_SAFE_IMPORTS = {"collections", "copy", "hashlib", "math", "random", "re", 
"string", "typing"} + +MAX_WORKERS = mp.cpu_count() // 2 + + + +def unsafe_imports(code): + """Check if code imports any unsafe modules. + + Args: + code (str): The code to check. + + Returns: + bool: True if code imports unsafe modules. + """ + if "import" not in code: + return False + for line in code.split("\n"): + if "import" in line: + match = re.search(r"^\s*from\s+([\w\.]+)\s+import\s", line) + if match: + modules = [match.group(1)] + else: + match = re.search(r"^\s*import\s+(.+)", line) + if match: + modules = match.group(1).split(",") + else: + return True + if any(m.strip() not in _SAFE_IMPORTS for m in modules): + return True + return False + + +def _judge(code_env): + code, env = code_env + if unsafe_imports(code) or any(u in code for u in _UNSAFE): + return False, Exception(f"unsafe code"), code + try: + exec(code, env.copy()) + return True, None, code + except Exception as e: + return False, e, code + + +def judge_parallel(src_codes, timeout, max_workers=MAX_WORKERS, env=_ENV): + codes = utils.dedup(src_codes) + utils.info( + f"Judging {len(src_codes):,} codes ({len(src_codes)-len(codes):,} duplicates) with {max_workers} workers" + ) + successes = set() + + # print("writing to file for debugging before judging") + # from train import save_json + # + # save_json(new_codes, "results/tmp/new_codes.json") + utils.silence_std_err(True) + with ProcessPool(max_workers=max_workers) as pool: + future = pool.map(_judge, [(code, env) for code in codes], timeout=timeout) + + results = future.result() + i = 0 + while True: + try: + success, exc, code = next(results) + if success: + successes.add(codes[i]) + except StopIteration: + break + except (TimeoutError, Exception) as error: + pass + assert i < len(codes) + i += 1 + assert i == len(codes) + utils.silence_std_err(False) + return [code in successes for code in src_codes] + + + + +def test(): + import time + + tests = [ + ("def sol(a: int=10000200001):\n return (list(range(3 * a))[str(a)])\nx = sol()", False), 
+ ("print(12)", True), + ("while True: pass", False), + ("def sol(): sol()\nsol()", False), + ("2+2", True), + ("""1+1""", True), + ("""assert False,'cats'""", False), + ("""assert False""", False), + ("""1[2]""", False), + ("""1/0""", False), + ( + """while True: + pass""", + False, + ), + ( + """for i in range(10**4): + pass""", + True, + ), + ("print('hello')", True), + ] + + scores = {} + tests2 = tests + pad = " " + for _ in range(6): + print(f"n={len(tests2)} timing test" + "*" * 20) + times = [] + for max_workers in [4, 16, 32, 64, 128]: + time0 = time.perf_counter() + res = judge_parallel([test for test, r in tests2], timeout=1, max_workers=max_workers) + for (test, expected), r in zip(tests2, res): + assert expected == r, f"Failed expected {expected}, got {r} for {test}" + + times.append((max_workers, time.perf_counter() - time0)) + + scores[len(tests2)] = times + tests2 = tests2 + [(t + pad, r) for (t, r) in tests2] + pad = pad * 2 + print("mp.cpu_count() =", mp.cpu_count()) + + for n, times in scores.items(): + print(n, "tests, [(max_workers, time)] =", times) + + +if __name__ == "__main__": + test() diff --git a/ICLR2023/src/neo_train.py b/ICLR2023/src/neo_train.py new file mode 100644 index 0000000..48fea64 --- /dev/null +++ b/ICLR2023/src/neo_train.py @@ -0,0 +1,452 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" +Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...) on a text file or a dataset. + +Here is the full list of checkpoints on the hub that can be fine-tuned by this script: +https://huggingface.co/models?filter=causal-lm +""" +# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments. + +"""This file is based on: https://github.com/huggingface/transformers/blob/1b5ce1e63b7bd4382cd1b4fdcca72d50f8b29494/examples/language-modeling/run_clm.py + +There were only two lines changed, both have the comment # CHANGED: added +""" + +import logging +import math +import os +import sys +from dataclasses import dataclass, field +from typing import Optional + +from datasets import load_dataset + +import transformers +from transformers import ( + CONFIG_MAPPING, + MODEL_FOR_CAUSAL_LM_MAPPING, + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + HfArgumentParser, + Trainer, + TrainingArguments, + default_data_collator, + set_seed, +) +from transformers.trainer_utils import get_last_checkpoint, is_main_process +from transformers.utils import check_min_version + + +# Will error if the minimal version of Transformers is not installed. Remove at your own risks. +check_min_version("4.5.0.dev0") + +logger = logging.getLogger(__name__) + + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +@dataclass +class ModelArguments: + """ + Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. + """ + + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": "The model checkpoint for weights initialization." + "Don't set if you want to train a model from scratch." 
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    At least one of `dataset_name`, `train_file` or `validation_file` must be given,
    otherwise `__post_init__` raises ValueError. Given files must have a csv, json or
    txt extension.
    """

    # Name and configuration of a dataset from the `datasets` library.
    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    # Local files, used instead of (or in addition to) a named dataset.
    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
    validation_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    # Optional caps on the number of examples.
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_val_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
            "value if set."
        },
    )

    block_size: Optional[int] = field(
        default=None,
        metadata={
            # The adjacent literals are concatenated, so each sentence needs its own
            # trailing space to keep the --help text readable.
            "help": "Optional input sequence length after tokenization. "
            "The training dataset will be truncated in block of this size for training. "
            "Default to the model max input length for single sentence inputs (take into account special tokens)."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
    )
    validation_split_percentage: Optional[int] = field(
        default=5,
        metadata={
            "help": "The percentage of the train set used as validation set in case there's no validation split"
        },
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )

    def __post_init__(self):
        """Check that a data source is given and that file extensions are supported."""
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
            raise ValueError("Need either a dataset name or a training/validation file.")
        # The extension is whatever follows the last dot of the file name.
        if self.train_file is not None:
            extension = self.train_file.split(".")[-1]
            assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
        if self.validation_file is not None:
            extension = self.validation_file.split(".")[-1]
            assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
+ last_checkpoint = None + if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir: + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0: + raise ValueError( + f"Output directory ({training_args.output_dir}) already exists and is not empty. " + "Use --overwrite_output_dir to overcome." + ) + elif last_checkpoint is not None: + logger.info( + f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " + "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." + ) + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN) + + # Log on each process the small summary: + logger.warning( + f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" + + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" + ) + # Set the verbosity to info of the Transformers logger (on main process only): + if is_main_process(training_args.local_rank): + transformers.utils.logging.set_verbosity_info() + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + logger.info("Training/evaluation parameters %s", training_args) + + # Set seed before initializing model. + set_seed(training_args.seed) + + # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) + # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ + # (the dataset will be downloaded automatically from the datasets Hub). 
+ # + # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called + # 'text' is found. You can easily tweak this behavior (see below). + # + # In distributed training, the load_dataset function guarantee that only one local process can concurrently + # download the dataset. + if data_args.dataset_name is not None: + # Downloading and loading a dataset from the hub. + datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name) + if "validation" not in datasets.keys(): + datasets["validation"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + split=f"train[:{data_args.validation_split_percentage}%]", + ) + datasets["train"] = load_dataset( + data_args.dataset_name, + data_args.dataset_config_name, + split=f"train[{data_args.validation_split_percentage}%:]", + ) + else: + data_files = {} + if data_args.train_file is not None: + data_files["train"] = data_args.train_file + if data_args.validation_file is not None: + data_files["validation"] = data_args.validation_file + extension = ( + data_args.train_file.split(".")[-1] + if data_args.train_file is not None + else data_args.validation_file.split(".")[-1] + ) + if extension == "txt": + extension = "text" + datasets = load_dataset(extension, data_files=data_files) + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at + # https://huggingface.co/docs/datasets/loading_datasets.html. + + # Load pretrained model and tokenizer + # + # Distributed training: + # The .from_pretrained methods guarantee that only one local process can concurrently + # download model & vocab. 
+ + config_kwargs = { + "cache_dir": model_args.cache_dir, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.config_name: + config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) + elif model_args.model_name_or_path: + config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) + else: + config = CONFIG_MAPPING[model_args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + + config.gradient_checkpointing = True # CHANGED: added + config.use_cache = False # CHANGED: added + + tokenizer_kwargs = { + "cache_dir": model_args.cache_dir, + "use_fast": model_args.use_fast_tokenizer, + "revision": model_args.model_revision, + "use_auth_token": True if model_args.use_auth_token else None, + } + if model_args.tokenizer_name: + tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) + elif model_args.model_name_or_path: + tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) + else: + raise ValueError( + "You are instantiating a new tokenizer from scratch. This is not supported by this script." + "You can do it from another script, save it, and load it from here, using --tokenizer_name." + ) + + if model_args.model_name_or_path: + model = AutoModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + from_tf=bool(".ckpt" in model_args.model_name_or_path), + config=config, + cache_dir=model_args.cache_dir, + revision=model_args.model_revision, + use_auth_token=True if model_args.use_auth_token else None, + ) + else: + logger.info("Training new model from scratch") + model = AutoModelForCausalLM.from_config(config) + + model.resize_token_embeddings(len(tokenizer)) + + # Preprocessing the datasets. + # First we tokenize all the texts. 
+ if training_args.do_train: + column_names = datasets["train"].column_names + else: + column_names = datasets["validation"].column_names + text_column_name = "text" if "text" in column_names else column_names[0] + + def tokenize_function(examples): + return tokenizer(examples[text_column_name]) + + tokenized_datasets = datasets.map( + tokenize_function, + batched=True, + num_proc=data_args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not data_args.overwrite_cache, + ) + + if data_args.block_size is None: + block_size = tokenizer.model_max_length + if block_size > 1024: + logger.warn( + f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). " + "Picking 1024 instead. You can change that default value by passing --block_size xxx." + ) + block_size = 1024 + else: + if data_args.block_size > tokenizer.model_max_length: + logger.warn( + f"The block_size passed ({data_args.block_size}) is larger than the maximum length for the model" + f"({tokenizer.model_max_length}). Using block_size={tokenizer.model_max_length}." + ) + block_size = min(data_args.block_size, tokenizer.model_max_length) + + # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. + def group_texts(examples): + # Concatenate all texts. + concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. + total_length = (total_length // block_size) * block_size + # Split by chunks of max_len. 
+ result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + + # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a remainder + # for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value might be slower + # to preprocess. + # + # To speed up this part, we use multiprocessing. See the documentation of the map method for more information: + # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map + + lm_datasets = tokenized_datasets.map( + group_texts, + batched=True, + num_proc=data_args.preprocessing_num_workers, + load_from_cache_file=not data_args.overwrite_cache, + ) + + if training_args.do_train: + if "train" not in tokenized_datasets: + raise ValueError("--do_train requires a train dataset") + train_dataset = lm_datasets["train"] + if data_args.max_train_samples is not None: + train_dataset = train_dataset.select(range(data_args.max_train_samples)) + + if training_args.do_eval: + if "validation" not in tokenized_datasets: + raise ValueError("--do_eval requires a validation dataset") + eval_dataset = lm_datasets["validation"] + if data_args.max_val_samples is not None: + eval_dataset = eval_dataset.select(range(data_args.max_val_samples)) + + # Initialize our Trainer + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_dataset if training_args.do_train else None, + eval_dataset=eval_dataset if training_args.do_eval else None, + tokenizer=tokenizer, + # Data collator will default to DataCollatorWithPadding, so we change it. 
class WeirdInputsException(Exception):
    """Raised by `get_inputs` when the first line of a puzzle is not a usable `def ...):` line."""
    pass


def get_inputs(sat: str):
    """Extracts the arguments past the first from a function string.

    For example

        def f(a: Dict[int, str], b=12):
            test

    should give 'b=12'.

    Only the first line of `sat` is inspected. A ` -> bool` return annotation and a
    trailing `# comment` after the closing `):` are ignored.

    :param sat: source code of the puzzle function
    :return: everything after the first top-level comma of the parameter list, stripped,
        or "" when the function takes a single parameter
    :raises WeirdInputsException: if the first line does not start with `def` and end
        with `):` (after removing a trailing comment)
    """
    sat = sat.replace(" -> bool", "")
    first_line = sat.split("\n")[0].strip()
    if not first_line.endswith("):") and "#" in first_line:
        if "):" in first_line:
            n = first_line.index("):")
            if "#" in first_line[n:]:
                # keep everything up to (not including) the comment that follows "):"
                first_line = first_line[:n + first_line[n:].index("#")].strip()
        else:
            first_line = ""  # raises exception below
    if not (first_line.endswith("):") and first_line.startswith("def")):
        raise WeirdInputsException("Weird puzzle, cannot extract inputs", json.dumps(sat))

    # Parameter list without the surrounding "(" and "):".
    arg_str = first_line[first_line.index("(") + 1:-len("):")]

    # Split at the first comma that is not nested inside any bracket pair, so that
    # annotations like Dict[int, str] and defaults like (1, 2) or {'a': 1, 'b': 2}
    # stay in one piece.
    depth = 0
    for i, c in enumerate(arg_str):
        if c in "([{":
            depth += 1
        elif c in ")]}":
            depth -= 1
        elif c == "," and depth == 0:
            return arg_str[i + 1:].strip()
    return ""
def _version_tuple(version: str) -> tuple:
    """Return the first three numeric components of a version string as ints.

    Non-numeric suffixes ("0.6.1+cu113", "0.7.0rc1") are ignored and missing
    components count as 0, so the result can be compared with a 3-tuple.
    """
    parts = []
    for piece in version.split(".")[:3]:
        digits = ""
        for ch in piece:
            if not ch.isdigit():
                break
            digits += ch
        parts.append(int(digits) if digits else 0)
    while len(parts) < 3:
        parts.append(0)
    return tuple(parts)


def _inputs_after_first_arg(args: str) -> str:
    """Return the text after the first comma of `args` that is not nested in brackets."""
    depth = 0
    for pos, ch in enumerate(args):
        if ch in "([{":
            depth += 1
        elif ch in ")]}":
            depth -= 1
        elif ch == "," and depth == 0:
            return args[pos + 1 :].strip()
    return ""


def load_model(model_path, pad_token_id=None, mp_size=1):
    """
    Load a GPT-Neo causal LM in half precision and wrap it with deepspeed inference.

    model_path: name or path passed to GPTNeoForCausalLM.from_pretrained
    pad_token_id: forwarded to from_pretrained
    mp_size: forwarded to deepspeed.init_inference
    Returns the `.module` of the deepspeed inference engine.
    """
    start_time = time.time()
    model = GPTNeoForCausalLM.from_pretrained(model_path, pad_token_id=pad_token_id).half()
    utils.info(f"Loaded model in {time.time()-start_time:.1f}s")

    print("deepspeed version", deepspeed.__version__)
    print("mp_size", mp_size)

    # Compare versions numerically: as strings, "0.10." sorts before "0.6.0" and
    # newer releases would wrongly take the pre-0.6.0 branch.
    if _version_tuple(deepspeed.__version__) >= (0, 6, 0):
        # This works on deepspeed 0.6.0 and later - deepspeed updated their tutorials and docs
        return deepspeed.init_inference(model, mp_size=mp_size, dtype=torch.float16, replace_method="auto", replace_with_kernel_inject=True).module
    else:
        # This works on deepspeed 0.5.1 and 0.5.6
        return deepspeed.init_inference(model, mp_size=mp_size, dtype=torch.float16, replace_method="auto").module


def get_puz_num_str(prefix: str):
    """
    If the prefix has def f1 ... def f5, it returns "6", otherwise it returns ""
    """
    if "def f1(" not in prefix:
        return ""
    num = 1
    while f"def f{num}(" in prefix:
        num += 1
    return str(num)


def gen_prompts(fs: List[str], prefix: str) -> List[str]:
    """
    Build one prompt per puzzle: the prefix, the puzzle, and the header of the solution to complete.

    fs: puzzle sources, each starting with "def f(" and containing "):" followed by a newline
    prefix: text placed before every puzzle; if it contains numbered examples
        (def f1 ... def fK) the new puzzle and solution are numbered K+1
    Returns the list of prompts, in the same order as fs.
    """
    ans = []

    puz_num_str = get_puz_num_str(prefix)

    for f in fs:
        args = f[f.index("(") + 1 : f.index("):\n")]
        # extract everything after the first argument; the split is bracket-aware so
        # a first argument annotated like Dict[int, str] is not cut in the middle
        inputs = _inputs_after_first_arg(args)

        f_new = f.replace("def f(", f"def f{puz_num_str}(").strip()
        prompt = f"{prefix}{f_new}\n\ndef g{puz_num_str}({inputs}):"

        ans.append(prompt)

    return ans


def trim_gen_texts(gen_texts, prefix):
    """
    Trim the generated texts to remove the prefix and find the end of the generated function

    gen_texts: generated texts, each of which must start with prefix
    prefix: the prompt prefix that was used for generation
    Returns a list aligned with gen_texts: the source of g (numbering removed), cut at the
    first later line that is not indented, or None when that source does not parse.
    """
    p_num = get_puz_num_str(prefix)

    assert all(t.startswith(prefix) for t in gen_texts)

    texts = [text[len(prefix) :] for text in gen_texts]  # remove prefix

    # undo the puzzle numbering: fK( -> f( and gK( -> g(
    texts = [text.replace(f"f{p_num}(", "f(").replace(f"g{p_num}(", "g(") for text in texts]

    gs = [t[t.index("def g(") :].strip() for t in texts]

    results = []
    for st in gs:
        lines = st.split("\n")
        for i in range(1, len(lines)):
            line = lines[i]
            # the function body ends at the first non-empty line that is not indented
            if line and line[0] not in " \t":
                lines = lines[:i]
                break
        g = "\n".join(lines).strip()

        try:
            ast.parse(g)
        except (SyntaxError, ValueError, RecursionError, MemoryError):
            # unparsable generation; a bare except would also swallow KeyboardInterrupt
            results.append(None)
        else:
            results.append(g)

    return results


def gen(prompts, tokenizer, model, batch_size, temp, gen_tokens):
    """
    Generate one continuation per prompt, batch_size prompts at a time.

    Every sequence in a batch is generated to exactly (longest prompt in the batch +
    gen_tokens) tokens, which must not exceed 2048. Sampling is used unless temp is 0.
    Returns the decoded texts (prompt included), in the same order as prompts.
    """
    start_time = time.time()

    gen_texts = []
    for start_i in range(0, len(prompts), batch_size):
        cur_prompts = prompts[start_i : start_i + batch_size]
        # NOTE(review): only input_ids are passed to generate (no attention_mask);
        # confirm padded prompts in a batch are handled as intended.
        tokens = tokenizer(cur_prompts, padding=True, return_tensors="pt").input_ids.cuda()
        max_length = tokens.shape[1] + gen_tokens  # tokens.shape[1] is num_tokens of the longest prompt
        with torch.no_grad():
            assert max_length <= 2048
            generated = model.generate(
                tokens,
                do_sample=(temp != 0),
                min_length=max_length,  # total length including prompt
                max_length=max_length,
                temperature=(temp or None),  # use None if temp == 0.0
                use_cache=True,
            )
        # possibly todo: trim all generations to gen_tokens length?
        gen_texts.extend(tokenizer.batch_decode(generated, skip_special_tokens=True, clean_up_tokenization_spaces=False))
    duration = time.time() - start_time
    utils.info(f"Generated {len(gen_texts)} texts in {duration:.1f}s")
    assert len(gen_texts) == len(prompts)
    assert all(t.startswith(prompt) for t, prompt in zip(gen_texts, prompts))
    return gen_texts
+ gen_tokens=150, + out="../outputs/solve//", + seed=0 +): + """ + Solve puzzles. Writes the results in outputs/solve/date-time folder. + + puzzles: the file containing the puzzles to solve (default: ../data/test_228.json) + prefix: text filename containing prompt (default: ../data/tutorial_prefix.txt) + attempts: number of attempts to solve each puzzle (default: 10) + fixed_temp: the temperature to use for the solver, if None it will automatically increase temperature (default: None) + model_path: the path to the model to fine tune (default "EleutherAI/gpt-neo-2.7B") + gpu: which gpu to use, currently only supports one gpu (default: 0) + batch_size: initial GPU batch size, automatically reduced if needed (default: 64) + gen_tokens: minimum number of tokens to generate per solution (default: 150) # todo make all equal + out: the path to write the output to, with filled in (default: ../outputs/solved//) + seed: random seed to use (default: 0) + """ + params = locals().copy() + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + os.environ['WORLD_SIZE'] = "4" + + if fixed_temp == 0.0: + utils.warn("*** Fixed temp is 0.0, boiling instead") + + start_time = time.time() + sats = utils.load_json(puzzles) + fs = [s.replace("def sat(", "def f(").strip() for s in sats] + prefix = utils.load_text_file(prefix).strip() if prefix else "" + if prefix: + prefix += "\n\n" + print("out", out) + output_path = utils.create_experiment_outpath(out) + + prompts = gen_prompts(fs, prefix) + prompts_by_f = {f: prompt for f, prompt in zip(fs, prompts)} + + utils.save_json(prompts, os.path.join(output_path, "prompts.json")) + results_filename = os.path.join(output_path, "results.json") + + # gpu if positive is which gpu to use + # gpu if negative is how many gpu to use + print("gpu", gpu) + if (int(gpu) < 0) : + from mpi4py import MPI + mpi_rank = MPI.COMM_WORLD.Get_rank() + mpi_size = MPI.COMM_WORLD.Get_size() + print("mpi_rank and mpi_size", mpi_rank, mpi_size) + + port = 
29600 + os.environ["MASTER_ADDR"] = '127.0.0.1' + os.environ["MASTER_PORT"] = str(port) + + print("calling init_distributed") + deepspeed.init_distributed() + mp_size = abs(int(gpu)) + else: + torch.cuda.set_device(int(gpu)) # problematic line for multiple gpus + mp_size = 1 + print ('Available devices ', torch.cuda.device_count()) + print ('Current cuda device ', torch.cuda.current_device()) + + tokenizer = utils.load_tokenizer(model_path) + model = load_model(model_path, pad_token_id=tokenizer.eos_token_id, mp_size=mp_size) # pad_token_id removes warnings + + all_gen = {f: [] for f in fs} # all generated solutions for each puzzle + solutions = {} # to record the solutions + + current_fs = fs.copy() # the puzzles we are solving at the current temperature + next_fs = [] # puzzles to solve at the next temperature + + if fixed_temp: + temp = fixed_temp + delta_temp = 0.0 + else: + temp = 0.0 + delta_temp = 0.2 + + while current_fs or next_fs: + # filter out solved puzzles and puzzles that have already been tried attempts times, and puzzles to be advanced + current_fs = [f for f in current_fs if len(all_gen[f]) < attempts and f not in solutions and f not in next_fs] + if not current_fs: + current_fs, next_fs = next_fs, [] + temp += delta_temp + continue + + potential_attempts = sum([attempts - len(all_gen[f]) for f in current_fs + next_fs]) + utils.info( + f"{len(solutions):,} solved; {potential_attempts:,} potential remaining attempts; " + + f"{len(current_fs):,} at temp={temp:.2f}; " + + ("" if fixed_temp else f"{len(next_fs):,} at temp={temp+delta_temp:.2f}") + ) + while True: # loop to decrease batch size if necessary + try: + gen_texts = gen( + [prompts_by_f[f] for f in current_fs], + tokenizer, + model, + batch_size, + temp, + gen_tokens, + ) + break + except RuntimeError as e: + if "out of memory" in str(e).lower() or "CUBLAS_STATUS_ALLOC_FAILED" in str(e): + utils.info(f"Out of GPU memory solve.py, reducing batch size {batch_size} -> {batch_size//2}") + 
batch_size //= 2 + assert batch_size >= 1 + torch.cuda.empty_cache() # not important, just lets nvidia-smi update if anything + else: + raise + + assert len(gen_texts) == len(current_fs) + gs = trim_gen_texts(gen_texts, prefix) + for f, g in zip(current_fs, gs): + if not fixed_temp: + if (temp==0.0 or (g and any(g == g2 for g2, _temp in all_gen[f]))): + next_fs.append(f) # increase temperature when you see a repeated solution or if temperature is 0.0 + # this will also cause it to be removed from current_fs at the beginning of the next loop + all_gen[f].append([g, temp]) + parsed = [(f, g) for f, g in zip(current_fs, gs) if g] + judge_srcs = [f"{f}\n{g}\nassert test_puzzle(f, g())" for f, g in parsed] + judgments = judge.judge_parallel(judge_srcs, timeout=1.0) + assert len(judgments) == len(parsed) == len(judge_srcs) <= len(current_fs) + for (f, g), solved in zip(parsed, judgments): + assert f not in solutions + if solved: + solutions[f] = g + for f in all_gen: + if f not in solutions: + assert len(all_gen[f]) == attempts + all_gen[f].append("# UNSOLVED #") # makes len(all_gen[f]) > attempts + + scores = sorted([len(all_gen[f]) for f in solutions]) + print(f"{len(solutions):,} solved at:", scores) + passes = {} + k = 1 + while True: + passes[k] = sum(len(gens) <= k for gens in all_gen.values()) + if k == attempts: + break + k = min(2 * k, attempts) + + utils.info(f" Pass@k out of {len(fs):,} puzzles") + utils.info(" k: ", "".join(f"{k:6,}" for k in passes)) + utils.info("# solved in <= k attempts: ", "".join(f"{passes[k]:6,}" for k in passes)) + + duration_mins = (time.time() - start_time) / 60 + results = dict( + duration_mins=duration_mins, passes=passes, scores=scores, params=params, solutions=solutions, all_gen=all_gen + ) + utils.info(f"Saved results generations to '{results_filename}'. 
def load_tokenizer(model_path):
    """Load the tokenizer for model_path, padding on the left with the EOS token."""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.padding_side = "left"
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer


def num_tokens(s: str, tokenizer, verbose=False):
    """Return the number of tokens the tokenizer produces for s.

    :param s: text to tokenize
    :param tokenizer: callable tokenizer returning an object with `input_ids`
    :param verbose: if True, log the input size and the time taken
    :return: second dimension of the resulting `input_ids`
    """
    start_time = time.time()
    if verbose:
        info(f"Tokenizing {pretty_int(len(s))} chars ({pretty_int(len(s.splitlines()))} lines)")
    # an explicit max_length with truncation avoids the annoying warnings of a bare tokenizer(s) call
    ans = tokenizer(s, return_tensors="pt", max_length=10 + len(s), truncation=True).input_ids.shape[1]

    duration_mins = (time.time() - start_time)/60
    if verbose:
        info(f"Num tokens: {ans:,} in {duration_mins:.2f} mins")
    return ans


def create_experiment_outpath(out: str, bSaveCommand=True):
    """
    Create the output directory and return its name. Also stores the command line in command.sh
    Every "<date>" in out is replaced by the current time formatted like Jan01-13-05-59

    :param out: output path, optionally containing the "<date>" placeholder
    :param bSaveCommand: if True, write the current command line to an executable command.sh in the directory
    :return: the path of the created directory
    """
    # Replace the explicit placeholder only: replacing "" would insert the
    # timestamp between every character of the path.
    output_path = str(out).replace("<date>", time.strftime("%b%d-%H-%M-%S"))
    os.makedirs(output_path, exist_ok=True)  # ran into error due to non-atomic check
    if bSaveCommand:
        save_text_file(' '.join([sys.executable] + sys.argv) + "\n", f"{output_path}/command.sh")
        # make command.sh executable:
        os.chmod(f"{output_path}/command.sh", 0o755)
    return output_path


def pretty_int(n: int) -> str:
    """Converts an integer to a string with commas, with M for millions and B for billions"""
    # >= so that exactly one billion is "1.0B" (not "1000.0M") and exactly one million is "1.0M"
    if n >= 1_000_000_000:
        return f"{n/1_000_000_000:.1f}B"
    if n >= 1_000_000:
        return f"{n/1_000_000:.1f}M"
    return f"{n:,}"