Initial implementation of slsets egg

This implements sets of symbols *as lists*, like in the srfi-1 lset operations. It maintains the benefit of using lists for everything, but not having to pay the quadratic performance penalty that srfi-1 entails. We achieve this by using plist operations to "mark" items so we have to visit the list "set" only a fixed number of times.
author: Peter Bex <peter@more-magic.net> 2025-07-29 14:52:12 +0200
committer: Peter Bex <peter@more-magic.net> 2025-08-05 09:41:44 +0200
commit: 84a9205579a6d069a40d23b4bc815bc1a50b8181 (patch)
tree: 82a7986e4f60a5d222ff0ceefbe6f199c57843e5
download: slsets-84a9205579a6d069a40d23b4bc815bc1a50b8181.tar.gz
4 files changed, 373 insertions, 0 deletions
diff --git a/benchmarks/benchmark.scm b/benchmarks/benchmark.scm
new file mode 100644
index 0000000..21d3c43
--- /dev/null
+++ b/benchmarks/benchmark.scm
@@ -0,0 +1,110 @@
+;;
+;; Simple benchmarking program to compare set-difference on hash
+;; tables, lsets and slsets.
+;;
+;; Note that the tests for slsets is severely disadvantaged because
+;; the hash tables are created with the size of the set, which
+;; normally would not be the case because you can't predict ahead of
+;; time how big it would get.  Also, the symbols have a short string
+;; value which makes hashing fast.  Slsets don't care about the symbol
+;; size and don't need a preallocated size.
+
+(import (chicken fixnum) (chicken string) (chicken process-context) srfi-1 srfi-69 slsets)
+
+(unless (= (length (command-line-arguments)) 2)
+  (error "Need impl name (one of lset, slset, symbolset or srfi-69) and set size"))
+
+(define impl (string->symbol (car (command-line-arguments))))
+(define size (string->number (cadr (command-line-arguments))))
+
+(define-inline (list-tabulate n fun)
+  (let lp ((i 0)
+           (res '()))
+    (if (= i n)
+        res
+        (lp (add1 i)
+            (cons (fun i) res)))))
+
+(define lst (list-tabulate size (lambda (i) (string->symbol (conc "x" i)))))
+(define lst (list-tabulate size (lambda (i) (string->symbol (conc "y" i)))))
+
+(define-inline (filter pred lst)
+  (let lp ((lst lst)
+           (res '()))
+    (cond ((null? lst) (reverse res))
+          ((not (pred (car lst))) (lp (cdr lst) res))
+          (else (lp (cdr lst) (cons (car lst) res))))))
+
+;;;;;;;;;;;;
+;;; Quick and dirty hash table as used in CHICKEN internally
+;;;;;;;;;;;;
+
+(define hash-symbol
+  (let ((cache-s #f)
+	(cache-h #f)
+	;; NOTE: All low-level hash tables share the same randomization factor
+	(rand (##core#inline "C_rand" #x10000)))
+    (lambda (s n)
+      (if (eq? s cache-s)
+	  (##core#inline "C_fixnum_modulo" cache-h n)
+	  (let ((bv (##sys#slot s 1)))
+	    (set! cache-s s)
+	    (set! cache-h (##core#inline "C_u_i_bytevector_hash" bv 0 (fx- (##sys#size bv) 1) rand))
+	    (##core#inline "C_fixnum_modulo" cache-h n))))))
+
+(define (make-my-hash-table #!optional (size 301))
+  (make-vector size '()))
+
+(define (my-hash-table-ref ht key)
+  (let loop ((bucket (##sys#slot ht (hash-symbol key (##sys#size ht)))))
+    (and (not (eq? '() bucket))
+	 (if (eq? key (##sys#slot (##sys#slot bucket 0) 0))
+	     (##sys#slot (##sys#slot bucket 0) 1)
+	     (loop (##sys#slot bucket 1))))))
+
+(define (my-hash-table-set! ht key val)
+  (let* ((k (hash-symbol key (##sys#size ht)))
+	 (ib (##sys#slot ht k)))
+      (let loop ((bucket ib))
+	(if (eq? '() bucket)
+	    (##sys#setslot ht k (cons (cons key val) ib))
+	    (if (eq? key (##sys#slot (##sys#slot bucket 0) 0))
+		(##sys#setslot (##sys#slot bucket 0) 1 val)
+		(loop (##sys#slot bucket 1)))))))
+
+(define (symbolset-difference ls . lss)
+  (let ((seen (make-my-hash-table size)))
+    (for-each (lambda (lst)
+                (for-each (lambda (x) (my-hash-table-set! seen x #t)) lst))
+              lss)
+    (filter (lambda (x) (not (my-hash-table-ref seen x))) ls)))
+
+;;;;;;;;;;;;
+
+;; For comparison, a bit strange
+(define (srfi-69-hash-table-difference ht1 ht2)
+  (let ((result (hash-table-copy ht1)))
+    (hash-table-walk ht2 (lambda (k v)
+                           (hash-table-delete! result k)))
+    result))
+
+;;;;;;;;;;;;
+
+(let lp ((i 10000))
+  (case impl
+    ((lset)
+     (lset-difference eq? lst (cons 'aha lst)))
+    ((slset)
+     (slset-difference lst (cons 'aha lst)))
+    ((symbolset)
+     (symbolset-difference lst (cons 'aha lst)))
+    ((srfi-69)
+     (when (pair? lst)
+       (let ((ht (make-hash-table eq? symbol-hash size)))
+         (begin 
+           (for-each (lambda (x) (hash-table-set! ht x #t)) lst)
+           (set! lst ht))))
+     (srfi-69-hash-table-difference lst lst))
+    (else (error "Unknown impl" impl)))
+  (unless (zero? i)
+    (lp (sub1 i))))
diff --git a/slsets.egg b/slsets.egg
new file mode 100644
index 0000000..fd2b6f3
--- /dev/null
+++ b/slsets.egg
@@ -0,0 +1,8 @@
+;;; slsets.egg -*- Scheme -*-
+
+((synopsis "Lists of symbols as sets")
+ (author "Peter Bex")
+ (category data)
+ (license "BSD")
+ (test-dependencies test)
+ (components (extension slsets (csc-options "-O3"))))
diff --git a/slsets.scm b/slsets.scm
new file mode 100644
index 0000000..aef229f
--- /dev/null
+++ b/slsets.scm
@@ -0,0 +1,155 @@
+;;
+;; Lists of symbols as sets
+;;
+;; Essentially, this provides specialized versions of SRFI-1's lset
+;; operations for symbols but with linear instead of quadratic time
+;; complexity.  It's even faster than using a side hash table, unless
+;; the symbols already have large plists associated with them.
+;;
+(module slsets
+    (slset<= slset= slset-adjoin ;; TODO: slset-delete and slset-contains?
+     slset-difference slset-intersection
+     slset-difference+intersection
+     slset-union slset-xor
+     slset-deduplicate)
+
+(import scheme (chicken base) (chicken foreign) (chicken plist))
+
+(define-inline (mark! x marking)
+  (when (symbol? x) (put! x marking #t)))
+
+(define-inline (mark-list! lst marking)
+  (for-each (lambda (x) (mark! x marking)) lst))
+
+(define-inline (unmark! x marking)
+  (when (symbol? x) (remprop! x marking)))
+
+(define-inline (unmark-list! lst marking)
+  (for-each (lambda (x) (unmark! x marking)) lst))
+
+(define-inline (with-marked-list lst fun)
+  (let ((marking (gensym 'm)))
+    (dynamic-wind
+        (lambda () (mark-list! lst marking))
+        (lambda () (fun marking))
+        (lambda () (unmark-list! lst marking)))))
+
+(define-inline (every? pred lst)
+  (let lp ((lst lst))
+    (cond ((null? lst) #t)
+          ((not (pred (car lst))) #f)
+          (else (lp (cdr lst))))))
+
+(define-inline (marked? x marking)
+  (and (symbol? x) (get x marking)))
+
+(define (last lst)
+  (if (null? (cdr lst))
+      (car lst)
+      (last (cdr lst))))
+
+(define-inline (filter pred lst)
+  (let lp ((lst lst)
+           (res '()))
+    (cond ((null? lst) (reverse res))
+          ((not (pred (car lst))) (lp (cdr lst) res))
+          (else (lp (cdr lst) (cons (car lst) res))))))
+
+(define (slset<= . lsts)
+  (or (null? lsts)
+      (let ((last-list (last lsts)))
+       (with-marked-list last-list
+                         (lambda (m)
+                           (every? (lambda (lst)
+                                     (or (eq? lst last-list) ; Avoid filtering out this one and building a new list
+                                         (every? (lambda (x) (marked? x m)) lst)))
+                                   lsts))))))
+
+(define (slset= . lsts)
+  (or (null? lsts)
+      (let ((lst (car lsts))
+            (lsts (cdr lsts)))
+        (every? (lambda (l)
+                  (and (slset<= lst l) (slset<= l lst)))
+                lsts))))
+
+(define (slset-adjoin lst . els)
+  (with-marked-list lst
+                    (lambda (m)
+                      (let lp ((lst lst)
+                               (els els))
+                        (cond ((null? els) lst)
+                              ((marked? (car els) m) (lp lst (cdr els)))
+                              (else (mark! (car els) m)
+                                    (lp (cons (car els) lst) (cdr els))))))))
+
+(define (slset-difference lst . lsts)
+  (let lp ((lsts lsts)
+           (res lst))
+    (if (null? lsts)
+        res
+        (lp (cdr lsts)
+            (with-marked-list (car lsts) (lambda (m) (filter (lambda (x) (not (marked? x m))) res)))))))
+
+(define (slset-intersection lst . lsts)
+  (let lp ((lsts lsts)
+           (res lst))
+    (if (null? lsts)
+        res
+        (lp (cdr lsts)
+            (with-marked-list (car lsts)
+                              (lambda (m) (filter (lambda (x) (marked? x m)) res)))))))
+
+;; TODO: Make this faster
+(define (slset-difference+intersection lst . lsts)
+  (values (apply slset-difference lst lsts)
+          (apply slset-intersection lst lsts)))
+
+;; NOTE: Can't use mark-list here, so roll our own!
+;; TODO: Clean up if there's an error cdring down any of these lists?
+(define (slset-union . lsts)
+  (if (null? lsts)
+      '()
+      (let ((marking (gensym 'm))
+            (first-list (car lsts)))
+        (mark-list! first-list marking)
+        (let lp ((lsts (cdr lsts))
+                 (res first-list))
+          (if (null? lsts)
+              (begin (unmark-list! res marking)
+                     res)
+              (let lp2 ((lst (car lsts))
+                        (res res))
+                (if (null? lst)
+                    (lp (cdr lsts) res)
+                    (let ((x (car lst)))
+                      (if (marked? x marking)
+                          (lp2 (cdr lst) res)
+                          (begin (mark! x marking)
+                                 (lp2 (cdr lst) (cons x res))))))))))))
+
+(define (slset-xor . lsts)
+  (let lp ((lsts lsts)
+           (res '()))
+    (if (null? lsts)
+        res
+        (lp (cdr lsts)
+            (append (slset-difference res (car lsts))
+                    (slset-difference (car lsts) res))))))
+
+;; NOTE: Can't use mark-list here, so roll our own!
+;; TODO: Clean up if there's an error cdring down the list?
+(define (slset-deduplicate lst)
+  (let ((marking (gensym 'm)))
+    (let lp ((lst lst)
+             (res '()))
+      (if (null? lst)
+          (begin (unmark-list! res marking)
+                 (reverse res))
+          (let ((x (car lst)))
+            (cond ((marked? x marking)
+                   (lp (cdr lst) res))
+                  (else (mark! x marking)
+                        (lp (cdr lst) (cons x res)))))))))
+
+)
diff --git a/tests/run.scm b/tests/run.scm
new file mode 100644
index 0000000..06f4f2c
--- /dev/null
+++ b/tests/run.scm
@@ -0,0 +1,100 @@
+;; (load "../slsets.scm")
+(import slsets test)
+
+(test-group "slsets"
+  (test-group "slset<="
+    (test-group "examples from srfi-1 document"
+      (test-assert (slset<= '(a) '(a b a) '(a b c c)))
+      (test-assert "trivial case no args" (slset<=))
+      (test-assert "trivial case one arg" (slset<= '(a))))
+    (test-group "negatives"
+      (test-assert (not (slset<= '(a b a) '(a b c c) '(a))))
+      (test-assert (not (slset<= '(a b) '(a c))))))
+
+  (test-group "slset="
+    (test-group "examples from srfi-1 document"
+      (test-assert (slset= '(b e a) '(a e b) '(e e b a)))
+      (test-assert "trivial case no args" (slset=))
+      (test-assert "trivial case one arg" (slset= '(a))))
+    (test-group "negatives"
+      (test-assert (not (slset= '(b e a) '(a e b x) '(e e b a))))
+      (test-assert (not (slset= '(b e a x) '(a e b) '(e e b a))))
+      (test-assert (not (slset= '(b e a) '(a e b) '(e e b a x))))))
+
+  (test-group "slset-adjoin"
+    (test-group "example from srfi-1 document"
+      (test '(u o i a b c d c e)
+            (slset-adjoin '(a b c d c e) 'a 'e 'i 'o 'u)))
+    ;; Spec says it should, so check it
+    (test-group "result shares a common tail with the list argument"
+      (let* ((lst '(a b c d c e))
+             (result (slset-adjoin lst 'a 'x 'e 'b)))
+        (test-assert (eq? lst (cdr result))))))
+
+  (test-group "slset-union"
+    (test-group "examples from srfi-1 document"
+      (test '(u o i a b c d e) (slset-union '(a b c d e) '(a e i o u)))
+      (test "Repeated elements in LIST1 are preserved"
+            '(x a a c)
+            (slset-union '(a a c) '(x a x)))
+      (test "trivial case no args"
+            '()
+            (slset-union))
+      (test "trivial case one arg"
+            '(a b c)
+            (slset-union '(a b c)))))
+
+  (test-group "slset-xor"
+    (test-group "examples from srfi-1 document"
+      ;;(test '(d c b i o u) (slset-xor '(a b c d e) '(a e i o u)))
+      ;; NOTE: The above has a peculiar ordering, but there's nothing
+      ;; in the spec guaranteeing it should be like that.  We return this:
+      (test '(b c d i o u) (slset-xor '(a b c d e) '(a e i o u)))
+      (test "trivial case no args"
+            '() (slset-xor))
+      (test "trivial case one arg"
+            '(a b c d e) (slset-xor '(a b c d e))))
+    (test-group "extra examples"
+      (test "for multiple lists, returns elements that appear in an odd number of lists"
+            '(c d i u e x) (slset-xor '(a b c d e) '(a e i o u) '(b o e) '(x)))
+      (test "duplicates in lists are preserved"
+            '(b b c d d i o u o) (slset-xor '(a b b c d e d) '(a e i o u o)))))
+
+  (test-group "slset-difference"
+    (test-group "examples from srfi-1 document"
+      (test '(b c d) (slset-difference '(a b c d e) '(a e i o u)))
+      (test "trivial case"
+            '(a b c) (slset-difference '(a b c))))
+    (test-group "more examples"
+      (test "duplicates in first list are preserved"
+            '(b b c d)
+            (slset-difference '(a b b c x x d x i e) '(a e i o u) '() '(x x y z z)))
+      (test "empty list returns same list"
+            '(a b b c d e)
+            (slset-difference '(a b b c d e) '()))
+      (test "empty list is empty"
+            '()
+            (slset-difference '() '(a b b c d e)))
+      (test "empty list is still empty after other args"
+            '()
+            (slset-difference '() '(a b) '(c d e) '()))))
+
+  (test-group "slset-intersection"
+    (test-group "examples from srfi-1 document"
+      (test '(a e) (slset-intersection '(a b c d e) '(a e i o u)))
+      (test "repeated elements in LIST1 are preserved"
+            '(a x a) (slset-intersection '(a x y a) '(x a x z)))
+      (test "trivial case"
+            '(a b c) (slset-intersection '(a b c))))
+    (test-group "more examples"
+      (test "multiple lists"
+            '(a a) (slset-intersection '(a x b y b a) '(x a x z) '(a b)))))
+
+  (test-group "slset-deduplicate (as per srfi-1's delete-duplicates)"
+    (test-group "example from srfi-1 document"
+      (test '(a b c z)
+            (slset-deduplicate '(a b a c a b c z))))
+    (test-group "more examples"
+      (test '() (slset-deduplicate '())))))
+
+(test-exit)
author	Peter Bex <peter@more-magic.net>	2025-07-29 14:52:12 +0200
committer	Peter Bex <peter@more-magic.net>	2025-08-05 09:41:44 +0200
commit	84a9205579a6d069a40d23b4bc815bc1a50b8181 (patch)
tree	82a7986e4f60a5d222ff0ceefbe6f199c57843e5
download	slsets-84a9205579a6d069a40d23b4bc815bc1a50b8181.tar.gz