From d2cf134eac1eda354cf662db9a8fb96e78916445 Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Mon, 2 Dec 2019 14:35:43 +0000 Subject: [PATCH] Add queries examples for different languages --- notebooks/ExploreQueries.ipynb | 620 +++++++++++++++++++++++++++++++++ 1 file changed, 620 insertions(+) create mode 100644 notebooks/ExploreQueries.ipynb diff --git a/notebooks/ExploreQueries.ipynb b/notebooks/ExploreQueries.ipynb new file mode 100644 index 00000000..b2ce192f --- /dev/null +++ b/notebooks/ExploreQueries.ipynb @@ -0,0 +1,620 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Query Exploration\n", + "\n", + "This notebook shows examples of the queries used for training and offline validation on the CodeSearchNet dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "import pandas as pd\n", + "from pathlib import Path\n", + "pd.set_option('max_colwidth',300)\n", + "from pprint import pprint" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before downloading the entire dataset, it may be useful to explore a small sample in order to understand the format and structure of the data. While the full dataset can be automatically downloaded with the `/script/setup` script located in this repo, we can alternatively download a subset of the data from S3. 
\n", + "\n", + "The s3 links follow this pattern:\n", + "\n", + "> https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/{python,java,go,php,ruby,javascript}.zip\n", + "\n", + "For example, the link for the `python` is:\n", + "\n", + "> https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/python.zip" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [], + "source": [ + "def print_10_docstrings(language: str):\n", + " print(f'---Print docstrings for test code snippets in {language}')\n", + " link_to_dataset_part = f'https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/{language}.zip'\n", + " !wget -Nq {link_to_dataset_part}\n", + "\n", + " zip_name = f'{language}.zip'\n", + " !unzip -oq {zip_name}\n", + " \n", + " test_file_path = f'{language}/final/jsonl/test/{language}_test_0.jsonl.gz'\n", + " # decompress this gzip file\n", + " !gzip -dfq {test_file_path}\n", + " \n", + " with open(f'{language}/final/jsonl/test/{language}_test_0.jsonl', 'r') as f:\n", + " sample_file = f.readlines()\n", + " \n", + " for i in range(0, len(sample_file), len(sample_file)//10):\n", + " print()\n", + " print(f'____{language}_{i}_____')\n", + " print(json.loads(sample_file[i])['docstring'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Python dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---Print docstrings for test code snippets in python\n", + "\n", + "____python_0_____\n", + "Extracts video ID from URL.\n", + "\n", + "____python_2217_____\n", + "Obtain the reconstruction error for the input test_data.\n", + "\n", + " :param H2OFrame test_data: The dataset upon which the reconstruction error is computed.\n", + " :param bool per_feature: Whether to return the square reconstruction error per feature.\n", + " Otherwise, return the mean square error.\n", + "\n", + " :returns: the 
reconstruction error.\n", + "\n", + "____python_4434_____\n", + ">>> string = 'apple orange \"banana tree\" green'\n", + " >>> splitstring(string)\n", + " ['apple', 'orange', 'green', '\"banana tree\"']\n", + "\n", + "____python_6651_____\n", + "Implements the request/response pattern via pub/sub\n", + " using a single wildcard subscription that handles\n", + " the responses.\n", + "\n", + "____python_8868_____\n", + ":param file_inp: a `filename` or ``sys.stdin``?\n", + " :param file_out: a `filename` or ``sys.stdout`?`\n", + "\n", + "____python_11085_____\n", + "Format output using *format_name*.\n", + "\n", + " This is a wrapper around the :class:`TabularOutputFormatter` class.\n", + "\n", + " :param iterable data: An :term:`iterable` (e.g. list) of rows.\n", + " :param iterable headers: The column headers.\n", + " :param str format_name: The display format to use.\n", + " :param \\*\\*kwargs: Optional arguments for the formatter.\n", + " :return: The formatted data.\n", + " :rtype: str\n", + "\n", + "____python_13302_____\n", + "Wrap a generator function in a decorator to supply line and column\n", + " information to the returned Python AST node and dependency nodes.\n", + "\n", + " Dependency nodes should likely only be included if they are new nodes\n", + " created in the same function wrapped by this function. Otherwise, dependencies\n", + " returned from e.g. 
calling `gen_py_ast` should be assumed to already have\n", + " their location information hydrated.\n", + "\n", + "____python_15519_____\n", + "Redirect a system stream to the provided target.\n", + "\n", + "____python_17736_____\n", + "Init client\n", + "\n", + "____python_19953_____\n", + "Gets a postcode object from the lat and long.\n", + " :param lat: The latitude to look up.\n", + " :param long: The longitude to look up.\n", + " :return: The mapping corresponding to the lat and long or none if the postcode does not exist.\n", + " :raises ApiError: When there was an error connecting to the API.\n", + " :raises CircuitBreakerError: When the circuit breaker is open.\n", + "\n", + "____python_22170_____\n", + "Converts an encoded URL to a dict.\n", + " Example: given string = 'a=1&b=2' it returns {'a': 1, 'b': 2}\n" + ] + } + ], + "source": [ + "print_10_docstrings('python')" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---Print docstrings for test code snippets in ruby\n", + "\n", + "____ruby_0_____\n", + "Returns a hash in the following format:\n", + " {\n", + " \"pod/web-1\" => [\n", + " \"Pulling: pulling image \"hello-world:latest\" (1 events)\",\n", + " \"Pulled: Successfully pulled image \"hello-world:latest\" (1 events)\"\n", + " ]\n", + " }\n", + "\n", + "____ruby_227_____\n", + "Enforces the `version_limit`, if set. Default: no limit.\n", + " @api private\n", + "\n", + "____ruby_454_____\n", + "Gather slices from params and axis according to indices.\n", + "\n", + "____ruby_681_____\n", + "Parse all results in the batch. 
Add records to shared list.\n", + " If the record was not found, the bins will be nil.\n", + "\n", + "____ruby_908_____\n", + "Adds the file reference with given UUID.\n", + "\n", + " @param [String] uuid UUID of the object.\n", + "\n", + "____ruby_1135_____\n", + "The main method implementing Ruby-like access methods for nested elements\n", + "\n", + "____ruby_1362_____\n", + "Stop validating at the Question node\n", + "\n", + "____ruby_1589_____\n", + "Upon a failure at the first URL, will automatically retry with the\n", + " second & third ones before finally raising an exception\n", + " Returns an HTTPResponse object\n", + "\n", + "____ruby_1816_____\n", + "Calculate an integer from a string.\n", + "\n", + " Example\n", + "\n", + " charset = SecretSharing::Charset.by_charset_string \"abc\"\n", + " charset.s_to_i \"ab\"\n", + " # => 6\n", + "\n", + " @param string [Integer] integer to convert to string\n", + " @return [String] converted string\n", + "\n", + "____ruby_2043_____\n", + "sums up statistics across all queries, indexed by model\n", + "\n", + "____ruby_2270_____\n", + "Computes pointQ = s * pointA\n" + ] + } + ], + "source": [ + "print_10_docstrings('ruby')" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---Print docstrings for test code snippets in php\n", + "\n", + "____php_0_____\n", + "Auto generated seed file.\n", + "\n", + "@return void\n", + "\n", + "____php_2839_____\n", + "Attach a function as a server method\n", + "\n", + "@param array|string $function Function name, array of function names to attach,\n", + "or SOAP_FUNCTIONS_ALL to attach all functions\n", + "@param string $namespace Ignored\n", + "@return Zend_Soap_Server\n", + "@throws Zend_Soap_Server_Exception on invalid functions\n", + "\n", + "____php_5678_____\n", + "响应命令.\n", + "\n", + "@param \\Leevel\\Kernel\\IApp $app\n", + "\n", + "____php_8517_____\n", + "Creates a 
default WP-CLI packages composer.json.\n", + "\n", + "@param string $composer_path Where the composer.json should be created\n", + "@return string Returns the absolute path of the newly created default WP-CLI packages composer.json.\n", + "\n", + "____php_11356_____\n", + "Gets the result set for date/pageview pairs\n", + "\n", + "@return ArrayList\n", + "\n", + "____php_14195_____\n", + "Set a bulk of input parameters from and array.\n", + "\n", + "@param array $arrayOfParameters\n", + "\n", + "____php_17034_____\n", + "Gets an array of files to lint.\n", + "\n", + "@param array $files array of files to check\n", + "@param array $directories array of directories to get the files from\n", + "\n", + "@return array\n", + "\n", + "____php_19873_____\n", + "{@inheritdoc}\n", + "\n", + "____php_22712_____\n", + "Run client script\n", + "\n", + "@param string $scriptStr\n", + "@return void\n", + "\n", + "____php_25551_____\n", + "Dispatch an event if the dispatcher is loaded\n", + "@param string $name event name to dispatch\n", + "@param EventInterface $event\n", + "@return boolean\n", + "\n", + "____php_28390_____\n", + "{@inheritdoc}\n" + ] + } + ], + "source": [ + "print_10_docstrings('php')" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---Print docstrings for test code snippets in javascript\n", + "\n", + "____javascript_0_____\n", + "Create an instance of Axios\n", + "\n", + "@param {Object} defaultConfig The default config for the instance\n", + "@return {Axios} A new instance of Axios\n", + "\n", + "____javascript_648_____\n", + "a function returning the mutations object\n", + "\n", + "@export\n", + "@param {object} userState\n", + "@returns {AnyObject} the mutations object\n", + "\n", + "____javascript_1296_____\n", + "Get all contents of the table/json file object\n", + "@param {string} arguments[0] [Table name]\n", + "@param {string} arguments[1] 
[Location of the database file] (Optional)\n", + "@param {Function} arguments[2] [callback function]\n", + " function getAll(tableName, callback) {\n", + "\n", + "____javascript_1944_____\n", + "find method in klass prototype chain\n", + "\n", + "____javascript_2592_____\n", + "Returns completions for markup syntaxes (HTML, Slim, Pug etc.)\n", + "@param {CodeMirror} editor\n", + "@param {CodeMirror.Position} pos Cursor position in editor\n", + "@param {Object} config Resolved Emmet config\n", + "@return {EmmetCompletion[]}\n", + "\n", + "____javascript_3240_____\n", + "Listen to chart events to save selections into to state object.\n", + "\n", + "____javascript_3888_____\n", + "Calculate the tree distance between a and b\n", + "\n", + "____javascript_4536_____\n", + "Determines if an opener paren should not have an existing space after it\n", + "@param {Object} left The paren token\n", + "@param {Object} right The token after it\n", + "@returns {boolean} True if the paren should reject the space\n", + "\n", + "____javascript_5184_____\n", + "上传对象\n", + "@param {String} accessKey\n", + "@param {String} secretKey\n", + "@param {String} bucket\n", + "@param {String} objectKey\n", + "@param {String} file\n", + "@param {Object} nosHeader\n", + "\n", + "____javascript_5832_____\n", + "Overriding Backbone.View's undelegateEvents to handle unbinding the `triggers`, `modelEvents`, and `collectionEvents` config\n", + "\n", + "____javascript_6480_____\n", + "Create a new collection\n", + "\n", + "@param {Object} options the options\n" + ] + } + ], + "source": [ + "print_10_docstrings('javascript')" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---Print docstrings for test code snippets in java\n", + "\n", + "____java_0_____\n", + "Makes sure the fast-path emits in order.\n", + "@param value the value to emit or queue up\n", + "@param delayError if true, errors are 
delayed until the source has terminated\n", + "@param disposable the resource to dispose if the drain terminates\n", + "\n", + "____java_2690_____\n", + "Generates a JavaScript reverse router.\n", + "\n", + "@param name the router's name\n", + "@param routes the reverse routes for this router\n", + "@return the router\n", + "@deprecated Deprecated as of 2.7.0. Use {@link #create(String, String, String,\n", + "JavaScriptReverseRoute...)} instead.\n", + "\n", + "____java_5380_____\n", + "Set the subset of columns to read (projection pushdown). Specified as an Avro\n", + "schema, the requested projection is converted into a Parquet schema for Parquet\n", + "column projection.\n", + "

\n", + "This is useful if the full schema is large and you only want to read a few\n", + "columns, since it saves time by not reading unused columns.\n", + "

\n", + "If a requested projection is set, then the Avro schema used for reading\n", + "must be compatible with the projection. For instance, if a column is not included\n", + "in the projection then it must either not be included or be optional in the read\n", + "schema. Use {@link #setAvroReadSchema(org.apache.hadoop.mapreduce.Job,\n", + "org.apache.avro.Schema)} to set a read schema, if needed.\n", + "@param job a job\n", + "@param requestedProjection the requested projection schema\n", + "@see #setAvroReadSchema(org.apache.hadoop.mapreduce.Job, org.apache.avro.Schema)\n", + "@see org.apache.parquet.avro.AvroParquetOutputFormat#setSchema(org.apache.hadoop.mapreduce.Job, org.apache.avro.Schema)\n", + "\n", + "____java_8070_____\n", + "OBJECT OR MAP. CHECK THE TYPE ATTRIBUTE TO KNOW IT.\n", + "\n", + "____java_10760_____\n", + "Fires a PropertyChangeEvent:\n", + "

\n", + "\n", + "____java_13450_____\n", + "Force a numeric value to be in a specified range\n", + "Only defined for simple integers (ValueClass LONG)\n", + "WARNING: unsigned values are forced into the\n", + "signed size, but the proper bit pattern is maintained.\n", + "The term \"force\" means that if the value is outside the typed\n", + "min/max values, it is pegged to the min or max value depending\n", + "on the sign. Note that truncation is not used.\n", + "\n", + "@param basetype the type to force value to in range\n", + "@param value the value to force\n", + "@return forced value\n", + "@throws ConversionException if forcing is not possible\n", + "\n", + "____java_16140_____\n", + "Fetches entire contents of an InputStream and represent\n", + "same data as result InputStream.\n", + "

\n", + "This method is useful where,\n", + "

\n", + "It can be used in favor of {@link #toByteArray()}, since it\n", + "avoids unnecessary allocation and copy of byte[].
\n", + "This method buffers the input internally, so there is no need to use a\n", + "BufferedInputStream.\n", + "\n", + "@param input Stream to be fully buffered.\n", + "@return A fully buffered stream.\n", + "@throws IOException if an I/O error occurs\n", + "@since 2.0\n", + "\n", + "____java_18830_____\n", + "Returns all HTriggerInfo as a collection with the matches given job id.\n", + "\n", + "@param credentials auto fill by {@link RobeAuth} annotation for authentication.\n", + "@return all {@link HTriggerInfo} as a collection\n", + "\n", + "____java_21520_____\n", + "build a sheet for configuration map.\n", + "\n", + "@param sheet\n", + "sheet.\n", + "@param sheetConfigMap\n", + "sheetConfiguration map.\n", + "@param cellAttributesMap\n", + "the cell attributes map\n", + "\n", + "____java_24210_____\n", + "setter for occurrences - sets The occurrences of this variant.\n", + "@generated\n", + "@param v value to set into the feature\n", + "\n", + "____java_26900_____\n", + "Reads the map data from the stream. This method must be overridden if a\n", + "subclass must be setup before put() is used.\n", + "

\n", + "Serialization is not one of the JDK's nicest topics. Normal serialization\n", + "will initialise the superclass before the subclass. Sometimes however,\n", + "this isn't what you want, as in this case the put() method\n", + "on read can be affected by subclass state.\n", + "

\n", + "The solution adopted here is to deserialize the state data of this class\n", + "in this protected method. This method must be called by the\n", + "readObject() of the first serializable subclass.\n", + "

\n", + "Subclasses may override if the subclass has a specific field that must be\n", + "present before put() or calculateThreshold()\n", + "will work correctly.\n", + "\n", + "@param in\n", + "the input stream\n" + ] + } + ], + "source": [ + "print_10_docstrings('java')" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---Print docstrings for test code snippets in go\n", + "\n", + "____go_0_____\n", + "// mustWaitPinReady waits up to 3-second until connection is up (pin endpoint).\n", + "// Fatal on time-out.\n", + "\n", + "____go_1429_____\n", + "// New generator for creating a Buffalo Web application\n", + "\n", + "____go_2858_____\n", + "// DeleteOperation deletes (cancels) a running operation\n", + "\n", + "____go_4287_____\n", + "// Descendants returns a slice containing all descendants of a node, 'id',\n", + "// in d which are an ancestor of at least one of the nodes in 'to'.\n", + "\n", + "____go_5716_____\n", + "// Attr returns the value of the named attribute. 
nil is returned when the\n", + "// attribute is not set.\n", + "\n", + "____go_7145_____\n", + "// withDeadline is like context.WithDeadline, except it ignores the zero deadline.\n", + "\n", + "____go_8574_____\n", + "// MarshalJSON supports json.Marshaler interface\n", + "\n", + "____go_10003_____\n", + "// UnmarshalJSON supports json.Unmarshaler interface\n", + "\n", + "____go_11432_____\n", + "// PublicationLineageLocator builds a locator from the given href.\n", + "\n", + "____go_12861_____\n", + "// RoundTrip calls f(r).\n", + "\n", + "____go_14290_____\n", + "// GetInnkeeperClient -- get an innkeeper client and cache it in the object\n" + ] + } + ], + "source": [ + "print_10_docstrings('go')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Examples of queries similar to those in the hidden dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1. Output to html file\n", + "2. How to determine if a string is a valid word\n", + "3. Convert int to string\n", + "4. Read JSON data\n", + "5. How to read .csv file in an efficient way?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}