Skip to content
This repository was archived by the owner on Jun 5, 2025. It is now read-only.

Commit 1090d43

Browse files
committed
On-the-fly secret scanning
This change introduces on-the-fly secret scanning. It still requires: * Response interception, e.g. output from the LLM * Validation with an end-to-end IDE <> LLM flow
1 parent 94308e5 commit 1090d43

40 files changed

Lines changed: 1759 additions & 207 deletions

demo.py

Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
import copy
2+
from typing import Dict, List, NamedTuple
3+
4+
from codegate.pipeline.secrets.signatures import CodegateSignatures
5+
6+
7+
# Immutable record describing one secret: which line it is on, the slice of
# that line it occupies, and the secret text itself.
SecretPosition = NamedTuple(
    "SecretPosition",
    [
        ("line_number", int),  # 1-based line number within the text
        ("start_index", int),  # slice start within that line
        ("end_index", int),  # slice end within that line (used as line[end_index:])
        ("value", str),  # the secret's original text
    ],
)
SecretPosition.__doc__ = "Exact location and value of a secret, kept so it can be restored later"
14+
15+
16+
class SecretManager:
    """Find, redact and restore secrets in blocks of text.

    ``find_and_redact`` replaces every detected secret with a placeholder and
    remembers what it removed, keyed by the *redacted* text. ``restore`` given
    that exact redacted text puts the original secrets back.

    Limitations:
        * ``restore`` only recognises text that is exactly equal to a value
          previously returned by ``find_and_redact``; any other text is
          returned unchanged.
        * If two different inputs redact to the same text, they share one
          store entry and the most recent call wins.
    """

    # Text written in place of every secret.
    _PLACEHOLDER = "REDACTED"

    def __init__(self, signatures_path: str = "signatures.yaml"):
        """Load the signature definitions and start with an empty store.

        Args:
            signatures_path: Path handed to ``CodegateSignatures.initialize``.
        """
        CodegateSignatures.initialize(signatures_path)
        # Maps key-of-redacted-text -> secrets removed from it. Positions are
        # expressed in the coordinates of the REDACTED text, in ascending
        # (line, start) order.
        self._secret_store: Dict[str, List[SecretPosition]] = {}

    def _generate_key(self, text: str) -> str:
        """Return a stable, collision-resistant key for a block of text."""
        # Function-scope import keeps this block self-contained.
        import hashlib

        # "surrogatepass" so text containing lone surrogates still hashes
        # instead of raising UnicodeEncodeError.
        return hashlib.sha256(text.encode("utf-8", "surrogatepass")).hexdigest()

    def find_and_redact(self, text: str) -> str:
        """Replace every secret in ``text`` with the placeholder.

        Line terminators and any trailing newline are preserved. Several
        secrets on one line are handled correctly, and overlapping matches are
        merged into a single redacted span so no part of a secret leaks.

        Args:
            text: Text to scan.

        Returns:
            The redacted text, or ``text`` unchanged if it is empty or
            contains no secrets.
        """
        if not text:
            return text

        matches = CodegateSignatures.find_in_string(text)
        if not matches:
            return text

        # keepends=True so "".join() rebuilds the text byte-for-byte outside
        # the redacted spans (a "\n".join would drop a trailing newline and
        # rewrite "\r\n").
        lines = text.splitlines(keepends=True)

        # Group match spans per line in left-to-right order, merging overlaps.
        spans_by_line: Dict[int, List[List[int]]] = {}
        ordered = sorted(
            matches, key=lambda m: (m.line_number, m.start_index, m.end_index)
        )
        for match in ordered:
            spans = spans_by_line.setdefault(match.line_number, [])
            if spans and match.start_index < spans[-1][1]:
                spans[-1][1] = max(spans[-1][1], match.end_index)
            else:
                spans.append([match.start_index, match.end_index])

        placeholder = self._PLACEHOLDER
        secrets: List[SecretPosition] = []
        for line_number, spans in spans_by_line.items():
            line = lines[line_number - 1]  # -1 since line numbers are 1-based
            pieces: List[str] = []
            cursor = 0  # position in the original line
            redacted_len = 0  # length of the redacted line built so far
            for start, end in spans:
                prefix = line[cursor:start]
                pieces.append(prefix)
                redacted_len += len(prefix)
                # Record where the placeholder sits in the REDACTED line;
                # those are the coordinates restore() will see. The value is
                # sliced from the text itself so the round trip is exact even
                # for merged spans.
                secrets.append(
                    SecretPosition(
                        line_number=line_number,
                        start_index=redacted_len,
                        end_index=redacted_len + len(placeholder),
                        value=line[start:end],
                    )
                )
                pieces.append(placeholder)
                redacted_len += len(placeholder)
                cursor = end
            pieces.append(line[cursor:])
            lines[line_number - 1] = "".join(pieces)

        redacted = "".join(lines)
        # Key by the redacted text: that is the only thing restore() receives.
        self._secret_store[self._generate_key(redacted)] = secrets
        return redacted

    def restore(self, redacted_text: str) -> str:
        """Put the original secrets back into previously redacted text.

        Args:
            redacted_text: A value previously returned by ``find_and_redact``.

        Returns:
            The text with secrets restored, or ``redacted_text`` unchanged if
            it is empty or nothing is stored for it.
        """
        if not redacted_text:
            return redacted_text

        stored_secrets = self._secret_store.get(self._generate_key(redacted_text))
        if not stored_secrets:
            return redacted_text

        lines = redacted_text.splitlines(keepends=True)

        # Secrets are stored in ascending order; walking them in reverse
        # restores each line right-to-left, so a replacement never shifts the
        # indices of a span that is still to be processed.
        for secret in reversed(stored_secrets):
            line = lines[secret.line_number - 1]
            lines[secret.line_number - 1] = (
                line[: secret.start_index] + secret.value + line[secret.end_index :]
            )

        return "".join(lines)
94+
95+
96+
def main() -> None:
    """Demo: redact the secrets in a sample snippet, then restore them.

    Prints the original text, the redacted text and the restored text, each
    under an 80-character rule, followed by whether the restored text equals
    the original.
    """
    # Original text with secrets
    text = """from flask import Flask, request, jsonify
import os
import hashlib

GITHUB_TOKEN = "ghp_aBcDeFgHiJkLmNoPqRsTuVwXyZ0123456789"

app = Flask(__name__)

@app.route('/api/data', methods=['GET'])
def get_data():
# Insecure: No input validation
AWS_ACCESS_KEY_ID = "AKIAIOSFODNN7EXAMPLE"

return {"data": "This is some insecure data!"}"""

    # Create secret manager (uses the default signatures path, "signatures.yaml")
    manager = SecretManager()

    # Find and redact secrets
    print("Original text:")
    print("-" * 80)
    print(text)
    print("\nRedacted text:")
    print("-" * 80)
    redacted = manager.find_and_redact(text)
    print(redacted)

    # Restore original secrets
    print("\nRestored text:")
    print("-" * 80)
    restored = manager.restore(redacted)
    print(restored)

    # Verify restoration matches original
    print("\nVerification:")
    print("-" * 80)
    print(f"Restoration successful: {text == restored}")


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

poetry.lock

Lines changed: 80 additions & 37 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ transformers = ">=4.46.3"
2020
litellm = "^1.52.16"
2121
llama_cpp_python = ">=0.3.2"
2222

23+
pycryptodome = "^3.21.0"
24+
cryptography = "^44.0.0"
2325
[tool.poetry.group.dev.dependencies]
2426
pytest = ">=7.4.0"
2527
pytest-cov = ">=4.1.0"

scripts/import_packages.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@ class PackageImporter:
1313
def __init__(self):
1414
self.client = weaviate.WeaviateClient(
1515
embedded_options=EmbeddedOptions(
16-
persistence_data_path="./weaviate_data",
17-
grpc_port=50052
16+
persistence_data_path="./weaviate_data", grpc_port=50052
1817
)
1918
)
2019
self.json_files = [
@@ -46,13 +45,13 @@ def generate_vector_string(self, package):
4645
"npm": "JavaScript package available on NPM",
4746
"go": "Go package",
4847
"crates": "Rust package available on Crates",
49-
"java": "Java package"
48+
"java": "Java package",
5049
}
5150
status_messages = {
5251
"archived": "However, this package is found to be archived and no longer maintained.",
5352
"deprecated": "However, this package is found to be deprecated and no longer "
5453
"recommended for use.",
55-
"malicious": "However, this package is found to be malicious."
54+
"malicious": "However, this package is found to be malicious.",
5655
}
5756
vector_str += f" is a {type_map.get(package['type'], 'unknown type')} "
5857
package_url = f"https://trustypkg.dev/{package['type']}/{package['name']}"
@@ -75,8 +74,9 @@ async def add_data(self):
7574
packages_dict = {
7675
f"{package.properties['name']}/{package.properties['type']}": {
7776
"status": package.properties["status"],
78-
"description": package.properties["description"]
79-
} for package in existing_packages
77+
"description": package.properties["description"],
78+
}
79+
for package in existing_packages
8080
}
8181

8282
for json_file in self.json_files:
@@ -85,12 +85,12 @@ async def add_data(self):
8585
packages_to_insert = []
8686
for line in f:
8787
package = json.loads(line)
88-
package["status"] = json_file.split('/')[-1].split('.')[0]
88+
package["status"] = json_file.split("/")[-1].split(".")[0]
8989
key = f"{package['name']}/{package['type']}"
9090

9191
if key in packages_dict and packages_dict[key] == {
9292
"status": package["status"],
93-
"description": package["description"]
93+
"description": package["description"],
9494
}:
9595
print("Package already exists", key)
9696
continue
@@ -102,8 +102,9 @@ async def add_data(self):
102102
# Synchronous batch insert after preparing all data
103103
with collection.batch.dynamic() as batch:
104104
for package, vector in packages_to_insert:
105-
batch.add_object(properties=package, vector=vector,
106-
uuid=generate_uuid5(package))
105+
batch.add_object(
106+
properties=package, vector=vector, uuid=generate_uuid5(package)
107+
)
107108

108109
async def run_import(self):
109110
self.setup_schema()

0 commit comments

Comments
 (0)