Skip to content

Commit

Permalink
Prepare for release.
Browse files Browse the repository at this point in the history
  • Loading branch information
ljuti committed May 10, 2023
1 parent 816bbbf commit a277664
Show file tree
Hide file tree
Showing 9 changed files with 192 additions and 32 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
/pkg/
/spec/reports/
/tmp/
/examples/**/*.db

# rspec failure tracking
.rspec_status
Expand Down
2 changes: 1 addition & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,4 @@ gem "rake", "~> 13.0"
gem "rspec", "~> 3.0"
gem "standard", "~> 1.3"

eval File.read("Gemfile.local") if File.exist?("Gemfile.local")
eval_gemfile "Gemfile.local" if File.exist?("Gemfile.local")
16 changes: 16 additions & 0 deletions lib/roseflow/text/completion.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# frozen_string_literal: true

module Roseflow
  module Text
    # Requests text completions for a fixed piece of input text.
    #
    # NOTE(review): `provider` is called below but is not defined anywhere in
    # this class as shown; it has to come from elsewhere (mixin, reopened
    # class, ...) — confirm before relying on this class.
    class Completion
      # @param input the text that is sent as the prompt of every request
      def initialize(input)
        @input = input
      end

      # Creates a new completion for the input given to the constructor.
      #
      # @param model the model identifier, forwarded to the provider as-is
      # @param prompt accepted for backward compatibility only and ignored;
      #   the prompt sent to the provider is always the constructor input.
      #   It used to be a required keyword even though it was never read, so
      #   it is optional now.
      #   NOTE(review): decide whether it should override @input or be removed.
      # @param options any further keywords, forwarded to the provider untouched
      # @return the `choices` of the provider's completions response
      def call(model:, prompt: nil, **options)
        provider.completions(model: model, prompt: @input, **options).choices
      end
    end
  end
end
2 changes: 2 additions & 0 deletions lib/roseflow/types.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@

# Shared type definitions; mixes in the module returned by Dry.Types() so the
# type constants it provides are reachable through this namespace.
module Types
include Dry.Types()

# Numeric type built by combining Types::Float and Types::Integer with `|`,
# i.e. meant to accept either a float or an integer value.
Number = Types::Float | Types::Integer
end
136 changes: 111 additions & 25 deletions lib/roseflow/vector_stores/hnsw_memory_store.rb
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,23 @@ module Roseflow
module VectorStores
UnsupportedSimilarityMetricError = Class.new(StandardError)

# HNSWMemoryStore is an in-memory vector store that implements
# the HNSW algorithm.
class HNSWMemoryStore < Base
# Probability constants for random level assignment. get_random_level compares
# `rand` against the first entry (0.5); the second entry is 1/e.
PROBABILITY_FACTORS = [
  0.5,
  1 / Math::E,
].freeze

# Initializes a new HNSWMemoryStore with the specified
# similarity metric, dimensions, m and ef.
#
# @param similarity_metric [Symbol] the similarity metric to use
# @param dimensions [Integer] the number of dimensions of the vectors
# @param m [Integer] the number of neighbors to consider when adding a node
# @param ef [Integer] the number of neighbors to consider when searching
# @raise [UnsupportedSimilarityMetricError] if the similarity metric is not supported
# @return [HNSWMemoryStore] the new HNSWMemoryStore
def initialize(similarity_metric, dimensions, m, ef)
@similarity_metric = similarity_metric
@dimensions = dimensions
Expand All @@ -27,6 +38,11 @@ def initialize(similarity_metric, dimensions, m, ef)

attr_accessor :nodes

# Adds a new node to the vector store.
#
# @param node_id [String] the ID of the node
# @param vector [Array<Float>] the vector of the node
# @return [HNSWNode] the new node
def add_node(node_id, vector)
level = get_random_level
node = HNSWNode.new(node_id, vector, level, @m)
Expand All @@ -51,14 +67,28 @@ def add_node(node_id, vector)
@nodes[node_id] = node
end

alias_method :create_vector, :add_node

# Removes the entry stored under the given ID from the node table.
#
# Only the @nodes table is touched here; neighbor lists held by other nodes
# are not modified by this method.
#
# @param node_id [String] the ID of the node to remove
# @return [HNSWNode, nil] whatever @nodes.delete returns: the removed node,
#   or nil when a Hash has no entry for the ID
def delete_node(node_id)
  removed = @nodes.delete(node_id)
  removed
end

alias_method :delete_vector, :delete_node

# Looks up a node by its ID in the node table.
#
# @param node_id [String] the ID to look up
# @return [HNSWNode, nil] the value stored under the ID in @nodes
#   (nil for a plain Hash when the ID is unknown)
def find(node_id)
  node = @nodes[node_id]
  node
end

# Serializes the vector store to a binary string.
def serialize
graph = HNSWGraph.new(
entrypoint_id: @entrypoint.id,
Expand All @@ -72,14 +102,15 @@ def serialize
id: node.id,
vector: node.vector,
level: node.level,
neighbors: node.neighbors.flatten.compact.map(&:id)
neighbors: node.neighbors.flatten.compact.map(&:id),
)
end
end,
)

graph.to_proto
end

# Deserializes a binary string into a vector store.
def self.deserialize(serialized_data)
graph = HNSWGraph.decode(serialized_data)

Expand All @@ -106,10 +137,12 @@ def self.deserialize(serialized_data)
hnsw
end

# Collects neighbor candidates for a query on one level by delegating to
# search_knn, asking for @m results.
#
# @param node the entry point handed to search_knn
# @param query the query vector
# @param level the level whose neighbor lists search_knn follows
# @return whatever search_knn returns for these arguments
def find_neighbors(node, query, level)
  neighbor_count = @m
  search_knn(node, query, neighbor_count, level)
end

# Updates the neighbors of a node.
def update_neighbors(node, neighbors, query, level)
node.neighbors[level] = neighbors[0, @m]

Expand All @@ -123,10 +156,12 @@ def update_neighbors(node, neighbors, query, level)
end
end

# Raises the recorded maximum level when the given level exceeds it;
# otherwise leaves @max_level alone.
#
# @param level [Integer] the level to compare against @max_level
# @return [Integer, nil] the new maximum when it was raised, nil otherwise
def update_max_level(level)
  return unless level > @max_level

  @max_level = level
end

# Finds the k nearest neighbors of a vector.
def nearest_neighbors(query, k)
return [] unless @entrypoint
entry_point = @entrypoint
Expand Down Expand Up @@ -170,42 +205,65 @@ def find_closest_neighbor(query, neighbors)
[closest_neighbor, closest_distance]
end

# Best-first search for the k nodes closest to a query on a single level.
#
# Starting from the entry point, the closest unvisited candidate is taken
# repeatedly. The search stops once k results are held and that candidate is
# no closer than the furthest of them; otherwise the candidate is admitted to
# the result and its neighbors on the level become candidates.
#
# @param entry_point [HNSWNode] the node the search starts from
# @param query [Array<Float>] the query vector
# @param k [Integer] the maximum number of results
# @param level [Integer] the level whose neighbor lists are followed
# @return [Array<HNSWNode>] up to k nodes, in no particular order
def search_knn(entry_point, query, k, level)
  visited = Set.new
  candidates = Set.new([entry_point])
  result = []

  until candidates.empty?
    closest = find_closest_candidate(candidates, query)
    candidates.delete(closest)
    visited.add(closest.id)

    # The stop check has to run BEFORE the candidate is admitted: once it is
    # part of the result it can tie with itself as the furthest entry, which
    # would end the search without ever expanding its neighbors.
    break if termination_condition_met?(result, closest, query, k)

    result = update_result(result, closest, query, k)
    add_neighbors_to_candidates(closest, level, visited, candidates)
  end

  result
end

# Picks the candidate whose vector is nearest to the query.
#
# @param candidates [Enumerable<HNSWNode>] the nodes to choose from
# @param query [Array<Float>] the query vector
# @return [HNSWNode, nil] the nearest candidate, nil when there are none
def find_closest_candidate(candidates, query)
  candidates.min_by { |c| distance(query, c.vector) }
end
# Offers a candidate to the running result list, which holds at most k nodes.
#
# While fewer than k nodes are held the candidate is simply appended. Once
# the list is full, the candidate replaces the node furthest from the query,
# but only if the candidate is strictly closer than that node.
#
# @param result [Array] the current result nodes; mutated in place
# @param candidate the node being offered
# @param query the query vector distances are measured against
# @param k [Integer] the maximum number of results to keep
# @return [Array] the same result array that was passed in
def update_result(result, candidate, query, k)
  return result.push(candidate) if result.size < k

  worst = result.max_by { |held| distance(query, held.vector) }
  if distance(query, candidate.vector) < distance(query, worst.vector)
    result.delete(worst)
    result.push(candidate)
  end

  result
end

# Decides whether the search can stop at the current candidate.
#
# Never stops while fewer than k results are held. With a full result list
# it stops when the candidate is at least as far from the query as the
# furthest node already in the result.
#
# @param result [Array] the current result nodes
# @param closest the candidate currently being examined
# @param query the query vector distances are measured against
# @param k [Integer] the number of results wanted
# @return [Boolean] true when the search should stop
def termination_condition_met?(result, closest, query, k)
  if result.size < k
    false
  else
    worst = result.max_by { |held| distance(query, held.vector) }
    worst_distance = distance(query, worst.vector)
    distance(query, closest.vector) >= worst_distance
  end
end

# Queues the unvisited neighbors of a node for further exploration.
#
# Walks the node's neighbor list for the given level, skipping empty slots
# and any neighbor whose ID is already in the visited set, and adds the rest
# to the candidate set.
#
# @param closest the node whose neighbors are expanded
# @param level the index into the node's per-level neighbor lists
# @param visited [Set] IDs of nodes that were already examined
# @param candidates [Set] the candidate set; mutated in place
# @return the neighbor list that was iterated
def add_neighbors_to_candidates(closest, level, visited, candidates)
  closest.neighbors[level].each do |neighbor|
    candidates.add(neighbor) if neighbor && !visited.include?(neighbor.id)
  end
end

# Calculates the distance between two vectors.
def distance(from, to)
case @similarity_metric.to_sym
when :euclidean
Expand All @@ -217,29 +275,32 @@ def distance(from, to)
end
end

# Calculates the euclidean distance between two vectors.
#
# Each squared component difference is accumulated exactly once and the
# square root of the total is returned. Components are paired by index,
# iterating over +from+.
#
# @param from [Array<Numeric>] the first vector
# @param to [Array<Numeric>] the second vector; read at the indices of +from+
# @return [Float] the euclidean distance
def euclidean_distance(from, to)
  squared_sum = from.each_with_index.reduce(0) do |sum, (value, index)|
    sum + (value - to[index]) ** 2
  end

  Math.sqrt(squared_sum)
end

# Calculates the cosine distance between two vectors, i.e. one minus the
# cosine of the angle between them.
#
# The dot product and both squared norms are accumulated once per component,
# pairing components by index while iterating over +from+. A vector of zero
# magnitude makes the denominator 0.0, so the result is NaN in that case.
#
# @param from [Array<Numeric>] the first vector
# @param to [Array<Numeric>] the second vector; read at the indices of +from+
# @return [Float] 1 - cosine similarity
def cosine_distance(from, to)
  dot_product = 0
  norm_from = 0
  norm_to = 0

  from.each_with_index do |value, index|
    other = to[index]
    dot_product += value * other
    norm_from += value ** 2
    norm_to += other ** 2
  end

  1 - (dot_product / (Math.sqrt(norm_from) * Math.sqrt(norm_to)))
end

# Returns a random level for a node.
def get_random_level
level = 0
while rand < PROBABILITY_FACTORS[0] && level < @max_level
Expand All @@ -248,10 +309,18 @@ def get_random_level
level
end

# HNSW vector store node.
class HNSWNode
attr_reader :id, :vector
attr_accessor :level, :neighbors

# Initializes a new node.
#
# @param id [String] the node ID (ULID)
# @param vector [Array] the node vector
# @param level [Integer] the node level
# @param m [Integer] the number of neighbors
# @return [HNSWNode] the node
def initialize(id, vector, level, m)
@id = id
@vector = vector
Expand All @@ -260,6 +329,9 @@ def initialize(id, vector, level, m)
end
end

# BoundedPriorityQueue is a data structure that keeps a priority queue
# of a bounded size. It maintains the top-k elements with the smallest
# priorities. It uses an underlying PriorityQueue to store elements.
class BoundedPriorityQueue
def initialize(max_size)
@max_size = max_size
Expand All @@ -270,6 +342,10 @@ def size
@queue.size
end

# Inserts an item into the BoundedPriorityQueue. If the queue is full
# and the new item has a smaller priority than the item with the
# highest priority, the highest priority item is removed and the new
# item is added.
def push(item)
if size < @max_size
@queue.push(item)
Expand All @@ -279,6 +355,8 @@ def push(item)
end
end

# Returns the head item of the underlying queue by delegating to its #peek.
# This method itself does not modify the BoundedPriorityQueue.
def peek
  head = @queue.peek
  head
end
Expand All @@ -288,6 +366,10 @@ def to_a
end
end

# PriorityQueue is a data structure that keeps elements ordered by priority.
# It supports inserting elements, removing the element with the smallest
# priority, and peeking at the element with the smallest priority. It uses
# a binary heap as the underlying data structure.
class PriorityQueue
def initialize
@elements = []
Expand All @@ -306,6 +388,8 @@ def push(item)
shift_up(@elements.size - 1)
end

# Removes and returns the element with the smallest priority.
# Returns nil if the PriorityQueue is empty.
def pop
return if empty?

Expand All @@ -315,6 +399,8 @@ def pop
element
end

# Returns the element at the front of the backing array without removing it
# (the slot the heap is meant to keep the smallest-priority element in).
# Returns nil when the PriorityQueue holds no elements.
def peek
  @elements[0]
end
Expand Down
38 changes: 38 additions & 0 deletions lib/roseflow/vector_stores/type/vector.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# frozen_string_literal: true

require "active_model/type"

module Roseflow
  module VectorStores
    module Type
      # ActiveModel attribute type that casts values to arrays of floats.
      #
      # NOTE(review): `Error` is resolved through the enclosing namespaces and
      # is not defined in this file — confirm which class it refers to.
      class Vector < ActiveModel::Type::Value
        # @param dimensions [Integer, nil] the expected number of components
        # @param model the model the attribute belongs to (stored, not read here)
        # @param attribute_name the attribute's name (stored, not read here)
        def initialize(dimensions:, model:, attribute_name:)
          super()
          @dimensions = dimensions
          @model = model
          @attribute_name = attribute_name
        end

        # Converts a value to an array of floats and validates it.
        #
        # @param value an object responding to #to_a whose elements respond to #to_f
        # @param dimensions [Integer, nil] the expected number of components;
        #   nil disables the size check
        # @raise [Error] if the size differs from +dimensions+ or a component
        #   is not finite (NaN or infinite)
        # @return [Array<Float>] the converted vector
        def self.cast(value, dimensions:)
          vector = value.to_a.map(&:to_f)

          # The keyword used to be accepted but never checked; enforce it so a
          # vector of the wrong size is rejected at the casting boundary.
          if dimensions && vector.size != dimensions
            raise Error, "Expected #{dimensions} dimensions, not #{vector.size}"
          end
          raise Error, "Values must be finite" unless vector.all?(&:finite?)

          vector
        end

        # Casts an attribute value using this type's configured dimensions.
        #
        # @param value the raw attribute value
        # @return [Array<Float>, nil] the cast vector, or nil for a nil value
        def cast(value)
          return if value.nil?

          self.class.cast(value, dimensions: @dimensions)
        end

        # Not implemented for this type.
        #
        # @raise [NotImplementedError] always
        def serialize(value)
          raise NotImplementedError
        end

        # Not implemented for this type.
        #
        # @raise [NotImplementedError] always
        def deserialize(value)
          raise NotImplementedError
        end
      end
    end
  end
end
Loading

0 comments on commit a277664

Please sign in to comment.