|
| 1 | +# This script indexes the classes and modules within a set of files using the |
| 2 | +# saved source functionality. |
| 3 | + |
| 4 | +require "prism" |
| 5 | +require "etc" |
| 6 | +require "tempfile" |
| 7 | + |
| 8 | +module Indexer |
| 9 | + # A class that implements the #enter functionality so that it can be passed to |
| 10 | + # the various save* APIs. This effectively bundles up all of the node_id and |
| 11 | + # field_name pairs so that they can be written back to the parent process. |
# Collects the [scope, node_id, field_name] triples handed to #enter so that
# they can later be written back to the parent process. An instance can be
# passed to the various save* APIs because it implements #enter.
class Repository
  # scope is the list of constant name parts currently being visited;
  # entries is the list of triples recorded so far.
  attr_reader :scope, :entries

  def initialize
    @scope = []
    @entries = []
  end

  # Run the given block with next_scope (an array of name parts) appended to
  # the current scope, and return the block's value. The previous scope is
  # restored when the block finishes, including when it raises, so that a
  # failure while visiting a nested definition cannot leave stale name parts
  # behind.
  def with(next_scope)
    previous_scope = scope
    @scope = previous_scope + next_scope
    yield
  ensure
    @scope = previous_scope
  end

  # True when nothing has been recorded yet.
  def empty?
    entries.empty?
  end

  # Record a node_id/field_name pair under the current scope. The scope is
  # stored as a single "::"-joined string.
  def enter(node_id, field_name)
    entries << [scope.join("::"), node_id, field_name]
  end
end
| 35 | + |
| 36 | + # Visit the classes and modules in the AST and save their locations into the |
| 37 | + # repository. |
# Visit the classes and modules in the AST and save the location of each
# one's constant path into the repository, under the name of the enclosing
# namespaces.
class Visitor < Prism::Visitor
  attr_reader :repository

  def initialize(repository)
    @repository = repository
  end

  def visit_class_node(node)
    index_namespace(node)
  end

  def visit_module_node(node)
    index_namespace(node)
  end

  private

  # Shared handling for class and module nodes: extend the repository's scope
  # with the parts of the node's constant path, save the constant path's
  # location, and then visit the body so nested definitions are recorded
  # under the extended scope.
  #
  # NOTE(review): any error raised by full_name_parts (Prism appears to raise
  # for constant paths with dynamic parts, e.g. `class self::Foo`) propagates
  # out of the visit — confirm that this is acceptable for the files indexed.
  def index_namespace(node)
    constant_path = node.constant_path

    repository.with(constant_path.full_name_parts) do
      constant_path.save_location(repository)
      visit(node.body)
    end
  end
end
| 59 | + |
| 60 | + # Index the classes and modules within a file. If there are any entries, |
| 61 | + # return them as a serialized string to the parent process. |
# Parse the file at filepath and record every class and module found in it.
# Returns nil when nothing was recorded. Otherwise returns a single string
# made of the filepath followed by the fields of every entry, all separated
# by "|", ready to be sent to the parent process.
def self.index(filepath)
  repository = Repository.new
  visitor = Visitor.new(repository)
  Prism.parse_file(filepath).value.accept(visitor)
  return if repository.empty?

  serialized_entries = repository.entries.join("|")
  "#{filepath}|#{serialized_entries}"
end
| 67 | +end |
| 68 | + |
# Index the classes and modules in every file matching glob, spreading the
# work over forked worker processes.
#
# glob  - a pattern passed to Dir[] to find the files to index.
# count - the number of worker processes to fork. Values below 1 are treated
#         as 1, which covers single-core machines where the default
#         evaluates to 0.
#
# Returns a Hash mapping each "::"-joined constant name to an array of the
# values returned by the relocation repository's #enter for that name.
def index_glob(glob, count = Etc.nprocessors - 1)
  # At least one worker is required: with zero workers nothing would be
  # indexed and the round-robin modulo below would divide by zero.
  worker_count = [count, 1].max

  process_ids = []
  filepath_writers = []
  index_reader, index_writer = IO.pipe

  # Fork off each worker with access to two pipes. The first pipe is the
  # index_writer, to which it writes all of the results of indexing the
  # various files. The second pipe is the filepath_reader, from which it reads
  # the filepaths that it needs to index.
  worker_count.times do
    filepath_reader, filepath_writer = IO.pipe

    process_ids << fork do
      # The child inherits the write ends belonging to every worker forked
      # before it, in addition to its own. Close all of them so that each
      # worker sees EOF as soon as the parent closes its copy, rather than
      # only once the later workers have exited.
      filepath_writers.each(&:close)
      filepath_writer.close
      index_reader.close

      while (filepath = filepath_reader.gets(chomp: true))
        results = Indexer.index(filepath)
        index_writer.puts(results) if results
      end
    end

    filepath_reader.close
    filepath_writers << filepath_writer
  end

  index_writer.close

  # In a separate thread, write all of the filepaths to the various worker
  # processes. This is done in a separate thread since puts will eventually
  # block when each of the pipe buffers fills up. We write in a round-robin
  # fashion to the various workers. This could be improved using a
  # work-stealing algorithm, but is fine if you don't end up having a ton of
  # variety in the size of your files.
  writer_thread =
    Thread.new do
      Dir[glob].each_with_index do |filepath, position|
        filepath_writers[position % worker_count].puts(filepath)
      end
    end

  index = Hash.new { |hash, key| hash[key] = [] }

  # In a separate thread, read all of the results from the various worker
  # processes and store them in the index. This is done in a separate thread
  # so that reads and writes can be interleaved. This is important so that the
  # index pipe doesn't fill up and block the writer.
  reader_thread =
    Thread.new do
      while (line = index_reader.gets(chomp: true))
        # Each line is the filepath followed by (name, node_id, field_name)
        # triples, all separated by "|".
        filepath, *entries = line.split("|")
        repository = Prism::Relocation.filepath(filepath).filepath.lines.code_unit_columns(Encoding::UTF_16LE).leading_comments

        entries.each_slice(3) do |(name, node_id, field_name)|
          index[name] << repository.enter(Integer(node_id), field_name.to_sym)
        end
      end
    end

  # Closing the filepath pipes once everything has been written is what lets
  # the workers reach EOF and exit.
  writer_thread.join
  filepath_writers.each(&:close)

  reader_thread.join
  index_reader.close

  process_ids.each { |process_id| Process.wait(process_id) }
  index
end
| 137 | + |
# Index every Ruby file under the lib directory found two levels above the
# directory containing this script.
lib_glob = File.expand_path("../../lib/**/*.rb", __dir__)
index_glob(lib_glob)
0 commit comments