Skip to content

Commit 0d8979a

Browse files
committed
wip: add starencoder model to generate embeddings
1 parent 20f7a54 commit 0d8979a

File tree

12 files changed

+1454
-111
lines changed

12 files changed

+1454
-111
lines changed

Cargo.lock

Lines changed: 624 additions & 24 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/gitignore/Cargo.toml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
[package]
2+
name = "gitignore"
3+
version = "0.1.0"
4+
edition.workspace = true
5+
license.workspace = true
6+
authors.workspace = true
7+
8+
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
9+
10+
[dependencies]
11+
glob = "0.3"
12+
thiserror = "1"
13+
14+
[dev-dependencies]
15+
tempdir = "0.3"

crates/gitignore/negate_glob_pattern

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
!crates/**/Cargo.toml

crates/gitignore/negate_pattern

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
!Cargo.toml
2+
Cargo.toml

crates/gitignore/regular_pattern

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Cargo.toml

crates/gitignore/relative_path

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
crates/

crates/gitignore/src/lib.rs

Lines changed: 212 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,212 @@
1+
use std::{
2+
fmt::Debug,
3+
fs::{canonicalize, File},
4+
io::{BufRead, BufReader},
5+
path::{Path, PathBuf},
6+
};
7+
8+
use glob::{MatchOptions, Pattern};
9+
10+
#[derive(thiserror::Error, Debug)]
11+
pub enum Error {
12+
#[error("io error: {0}")]
13+
Io(#[from] std::io::Error),
14+
#[error("non utf8 path")]
15+
NonUtf8Path,
16+
#[error("glob pattern error: {0}")]
17+
Pattern(#[from] glob::PatternError),
18+
}
19+
20+
/// Convenience alias for results whose error type is this crate's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
21+
22+
#[derive(Debug)]
23+
pub struct Rule {
24+
negate: bool,
25+
pattern: Pattern,
26+
_source_line: usize,
27+
}
28+
29+
impl Rule {
30+
pub fn parse(
31+
mut pattern: String,
32+
base_path: impl AsRef<Path>,
33+
_source_line: usize,
34+
) -> Result<Option<Self>> {
35+
if pattern.trim().is_empty() || pattern.starts_with('#') {
36+
return Ok(None);
37+
}
38+
let negate = if pattern.starts_with('!') {
39+
pattern.remove(0);
40+
true
41+
} else {
42+
false
43+
};
44+
let directory = if pattern.ends_with('/') {
45+
pattern.pop();
46+
true
47+
} else {
48+
false
49+
};
50+
let anchored = pattern.contains('/');
51+
let pattern = if anchored {
52+
let base = format!("{}/{pattern}", base_path.as_ref().to_str().unwrap());
53+
if directory {
54+
format!("{base}/**")
55+
} else {
56+
base
57+
}
58+
} else if !pattern.starts_with("**") {
59+
let base = format!("**/{pattern}");
60+
if directory {
61+
format!("{base}/**")
62+
} else {
63+
base
64+
}
65+
} else {
66+
pattern
67+
};
68+
Ok(Some(Self {
69+
negate,
70+
pattern: Pattern::new(&pattern)?,
71+
_source_line,
72+
}))
73+
}
74+
}
75+
76+
#[derive(Debug)]
77+
pub struct Gitignore {
78+
rules: Vec<Rule>,
79+
_source_file: PathBuf,
80+
}
81+
82+
impl Gitignore {
83+
/// Parses a `.gitignore` file at `path`.
84+
///
85+
/// If `path` is a directory, attempts to read `{dir}/.gitignore`.
86+
pub fn parse(path: impl AsRef<Path>) -> Result<Self> {
87+
let mut path = canonicalize(path)?;
88+
if path.is_dir() {
89+
path = path.join(".gitignore");
90+
}
91+
let reader = BufReader::new(File::open(&path)?);
92+
let mut rules = Vec::new();
93+
for (line_nb, line) in reader.lines().enumerate() {
94+
let line = line?;
95+
if let Some(rule) = Rule::parse(line, path.parent().unwrap(), line_nb + 1)? {
96+
rules.push(rule);
97+
}
98+
}
99+
Ok(Self {
100+
rules,
101+
_source_file: path,
102+
})
103+
}
104+
105+
pub fn ignored(&self, path: impl AsRef<Path>) -> Result<bool> {
106+
let path = canonicalize(path)?;
107+
let match_opts = MatchOptions {
108+
case_sensitive: true,
109+
require_literal_separator: true,
110+
require_literal_leading_dot: false,
111+
};
112+
for rule in &self.rules {
113+
println!("matching {} to {rule:?}", path.to_str().unwrap());
114+
let path_str = path.to_str().ok_or(Error::NonUtf8Path)?;
115+
let to_match = if path.is_dir() {
116+
format!("{path_str}/")
117+
} else {
118+
path_str.to_owned()
119+
};
120+
if rule.pattern.matches_with(&to_match, match_opts) {
121+
return Ok(!rule.negate);
122+
}
123+
}
124+
Ok(false)
125+
}
126+
}
127+
128+
#[cfg(test)]
mod tests {
    use std::sync::Once;

    use super::*;

    /// Guards the one-time switch of the working directory below.
    static INIT: Once = Once::new();

    /// Writes `rules` to a scratch file called `name` two directories above
    /// the initial working directory, parses it and removes the file again.
    ///
    /// The scratch file is deleted *before* the parse result is unwrapped so
    /// that a failing parse does not leave the fixture behind on disk.
    fn create_gitignore(rules: &str, name: &str) -> Gitignore {
        INIT.call_once(|| {
            std::env::set_current_dir(canonicalize("../..").unwrap()).unwrap();
        });
        std::fs::write(name, rules).unwrap();
        let parsed = Gitignore::parse(name);
        std::fs::remove_file(name).unwrap();
        parsed.unwrap()
    }

    #[test]
    fn test_regular_pattern() {
        let gitignore = create_gitignore("Cargo.toml", "regular_pattern");
        assert!(gitignore.ignored("Cargo.toml").unwrap());
        assert!(!gitignore.ignored("LICENSE").unwrap());
    }

    #[test]
    fn test_glob_pattern() {
        let gitignore = create_gitignore("crates/**/Cargo.toml", "glob_pattern");
        assert!(gitignore.ignored("crates/gitignore/Cargo.toml").unwrap());
        assert!(gitignore.ignored("crates/llm-ls/Cargo.toml").unwrap());
        assert!(gitignore.ignored("crates/lsp-client/Cargo.toml").unwrap());
        assert!(gitignore.ignored("crates/mock_server/Cargo.toml").unwrap());
        assert!(gitignore.ignored("crates/testbed/Cargo.toml").unwrap());
        assert!(!gitignore.ignored("crates/llm-ls/src/main.rs").unwrap());
        assert!(!gitignore.ignored("crates/lsp-client/src/lib.rs").unwrap());
        assert!(!gitignore.ignored("crates/testbed/src/main.rs").unwrap());
    }

    #[test]
    fn test_negate_glob_pattern() {
        let gitignore = create_gitignore("!crates/**/Cargo.toml", "negate_glob_pattern");
        assert!(!gitignore.ignored("crates/gitignore/Cargo.toml").unwrap());
        assert!(!gitignore.ignored("crates/llm-ls/Cargo.toml").unwrap());
        assert!(!gitignore.ignored("crates/lsp-client/Cargo.toml").unwrap());
        assert!(!gitignore.ignored("crates/mock_server/Cargo.toml").unwrap());
        assert!(!gitignore.ignored("crates/testbed/Cargo.toml").unwrap());
        assert!(!gitignore.ignored("crates/llm-ls/src/main.rs").unwrap());
        assert!(!gitignore.ignored("crates/lsp-client/src/lib.rs").unwrap());
        assert!(!gitignore.ignored("crates/testbed/src/main.rs").unwrap());
    }

    #[test]
    fn test_start_glob_pattern() {
        let gitignore = create_gitignore("**/crates/", "start_glob_pattern");
        assert!(gitignore.ignored("crates/").unwrap());
        assert!(gitignore.ignored("crates/llm-ls/Cargo.toml").unwrap());
        assert!(gitignore
            .ignored("crates/testbed/repositories/simple/src/main.rs")
            .unwrap());
        assert!(!gitignore.ignored("xtask/").unwrap());
        assert!(!gitignore.ignored("README.md").unwrap());
    }

    #[test]
    fn test_relative_path() {
        let gitignore = create_gitignore("crates/", "relative_path");
        assert!(gitignore.ignored("crates/").unwrap());
        assert!(gitignore.ignored("crates/llm-ls/Cargo.toml").unwrap());
        assert!(gitignore
            .ignored("crates/testbed/repositories/simple/src/main.rs")
            .unwrap());
        assert!(!gitignore.ignored("xtask/").unwrap());
        assert!(!gitignore.ignored("README.md").unwrap());
    }

    #[test]
    fn test_negate_pattern() {
        let gitignore = create_gitignore(
            "!Cargo.toml\n\
            Cargo.toml",
            "negate_pattern",
        );
        assert!(!gitignore.ignored("Cargo.toml").unwrap());
    }
}

crates/gitignore/start_glob_pattern

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
**/crates

crates/llm-ls/Cargo.toml

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@ name = "llm-ls"
99
[dependencies]
1010
arrow-array = "47"
1111
arrow-schema = "47"
12+
candle = { version = "0.3", package = "candle-core", default-features = false }
13+
candle-nn = "0.3"
14+
candle-transformers = "0.3"
15+
futures-util = "0.3"
16+
gitignore = { path = "../gitignore" }
17+
hf-hub = { version = "0.3", features = ["tokio"] }
1218
home = "0.5"
1319
ropey = "1.6"
1420
reqwest = { version = "0.11", default-features = false, features = [
@@ -17,7 +23,8 @@ reqwest = { version = "0.11", default-features = false, features = [
1723
] }
1824
serde = { version = "1", features = ["derive"] }
1925
serde_json = "1"
20-
tokenizers = { version = "0.14", default-features = false, features = ["onig"] }
26+
thiserror = "1"
27+
tokenizers = { version = "0.15", default-features = false, features = ["onig"] }
2128
tokio = { version = "1", features = [
2229
"fs",
2330
"io-std",
@@ -34,8 +41,9 @@ tree-sitter-bash = "0.20"
3441
tree-sitter-c = "0.20"
3542
tree-sitter-cpp = "0.20"
3643
tree-sitter-c-sharp = "0.20"
44+
tree-sitter-css = "0.20"
3745
tree-sitter-elixir = "0.1"
38-
tree-sitter-erlang = "0.2"
46+
tree-sitter-erlang = "0.3"
3947
tree-sitter-go = "0.20"
4048
tree-sitter-html = "0.19"
4149
tree-sitter-java = "0.20"
@@ -44,6 +52,7 @@ tree-sitter-json = "0.20"
4452
tree-sitter-lua = "0.0.19"
4553
tree-sitter-md = "0.1"
4654
tree-sitter-objc = "3"
55+
tree-sitter-php = "0.20.0"
4756
tree-sitter-python = "0.20"
4857
tree-sitter-r = "0.19"
4958
tree-sitter-ruby = "0.20"

crates/llm-ls/src/error.rs

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
use std::fmt::Display;
2+
3+
use hf_hub::api::tokio::ApiError;
4+
use tower_lsp::jsonrpc::Error as LspError;
5+
use tracing::error;
6+
7+
use crate::APIError;
8+
9+
pub fn internal_error<E: Display>(err: E) -> LspError {
10+
let err_msg = err.to_string();
11+
error!(err_msg);
12+
LspError {
13+
code: tower_lsp::jsonrpc::ErrorCode::InternalError,
14+
message: err_msg.into(),
15+
data: None,
16+
}
17+
}
18+
19+
#[derive(thiserror::Error, Debug)]
20+
pub enum Error {
21+
#[error("backend api error: {0}")]
22+
Api(#[from] APIError),
23+
#[error("arrow error: {0}")]
24+
Arrow(#[from] arrow_schema::ArrowError),
25+
#[error("candle error: {0}")]
26+
Candle(#[from] candle::Error),
27+
#[error("gitignore error: {0}")]
28+
Gitignore(#[from] gitignore::Error),
29+
#[error("hugging face api error: {0}")]
30+
HfApi(#[from] ApiError),
31+
#[error("http error: {0}")]
32+
Http(#[from] reqwest::Error),
33+
#[error("io error: {0}")]
34+
Io(#[from] std::io::Error),
35+
#[error("invalid header value: {0}")]
36+
InvalidHeaderValue(#[from] reqwest::header::InvalidHeaderValue),
37+
#[error("invalid repository id")]
38+
InvalidRepositoryId,
39+
#[error("invalid tokenizer path")]
40+
InvalidTokenizerPath,
41+
#[error("index out of bounds: {0}")]
42+
OutOfBoundIndexing(usize),
43+
#[error("line out of bounds: {0}")]
44+
OutOfBoundLine(usize),
45+
#[error("slice out of bounds: {0}..{1}")]
46+
OutOfBoundSlice(usize, usize),
47+
#[error("rope error: {0}")]
48+
Rope(#[from] ropey::Error),
49+
#[error("serde json error: {0}")]
50+
SerdeJson(#[from] serde_json::Error),
51+
#[error("tokenizer error: {0}")]
52+
Tokenizer(#[from] tokenizers::Error),
53+
#[error("tokio join error: {0}")]
54+
TokioJoin(#[from] tokio::task::JoinError),
55+
#[error("vector db error: {0}")]
56+
VectorDb(#[from] vectordb::error::Error),
57+
}
58+
59+
/// Convenience alias for results whose error type is this module's [`Error`].
pub type Result<T> = std::result::Result<T, Error>;
60+
61+
impl From<Error> for LspError {
62+
fn from(err: Error) -> Self {
63+
internal_error(err)
64+
}
65+
}

0 commit comments

Comments
 (0)