summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeter Bex <peter@more-magic.net>2025-07-29 14:52:12 +0200
committerPeter Bex <peter@more-magic.net>2025-08-05 09:41:44 +0200
commit84a9205579a6d069a40d23b4bc815bc1a50b8181 (patch)
tree82a7986e4f60a5d222ff0ceefbe6f199c57843e5
downloadslsets-84a9205579a6d069a40d23b4bc815bc1a50b8181.tar.gz
Initial implementation of slsets egg
This implements sets of symbols *as lists*, like in the srfi-1 lset operations. It maintains the benefit of using lists for everything, but not having to pay the quadratic performance penalty that srfi-1 entails. We achieve this by using plist operations to "mark" items so we have to visit the list "set" only a fixed number of times.
-rw-r--r--benchmarks/benchmark.scm110
-rw-r--r--slsets.egg8
-rw-r--r--slsets.scm155
-rw-r--r--tests/run.scm100
4 files changed, 373 insertions, 0 deletions
diff --git a/benchmarks/benchmark.scm b/benchmarks/benchmark.scm
new file mode 100644
index 0000000..21d3c43
--- /dev/null
+++ b/benchmarks/benchmark.scm
@@ -0,0 +1,110 @@
+;;
+;; Simple benchmarking program to compare set-difference on hash
+;; tables, lsets and slsets.
+;;
+;; Note that the tests for slsets is severely disadvantaged because
+;; the hash tables are created with the size of the set, which
+;; normally would not be the case because you can't predict ahead of
+;; time how big it would get. Also, the symbols have a short string
+;; value which makes hashing fast. Slsets don't care about the symbol
+;; size and don't need a preallocated size.
+
+(import (chicken fixnum) (chicken string) (chicken process-context) srfi-1 srfi-69 slsets)
+
+(unless (= (length (command-line-arguments)) 2)
+ (error "Need impl name (one of lset, slset, symbolset or srfi-69) and set size"))
+
+(define impl (string->symbol (car (command-line-arguments))))
+(define size (string->number (cadr (command-line-arguments))))
+
+(define-inline (list-tabulate n fun)
+ (let lp ((i 0)
+ (res '()))
+ (if (= i n)
+ res
+ (lp (add1 i)
+ (cons (fun i) res)))))
+
+(define lst (list-tabulate size (lambda (i) (string->symbol (conc "x" i)))))
+(define lst (list-tabulate size (lambda (i) (string->symbol (conc "y" i)))))
+
+(define-inline (filter pred lst)
+ (let lp ((lst lst)
+ (res '()))
+ (cond ((null? lst) (reverse res))
+ ((not (pred (car lst))) (lp (cdr lst) res))
+ (else (lp (cdr lst) (cons (car lst) res))))))
+
+;;;;;;;;;;;;
+;;; Quick and dirty hash table as used in CHICKEN internally
+;;;;;;;;;;;;
+
+(define hash-symbol
+ (let ((cache-s #f)
+ (cache-h #f)
+ ;; NOTE: All low-level hash tables share the same randomization factor
+ (rand (##core#inline "C_rand" #x10000)))
+ (lambda (s n)
+ (if (eq? s cache-s)
+ (##core#inline "C_fixnum_modulo" cache-h n)
+ (let ((bv (##sys#slot s 1)))
+ (set! cache-s s)
+ (set! cache-h (##core#inline "C_u_i_bytevector_hash" bv 0 (fx- (##sys#size bv) 1) rand))
+ (##core#inline "C_fixnum_modulo" cache-h n))))))
+
+(define (make-my-hash-table #!optional (size 301))
+ (make-vector size '()))
+
+(define (my-hash-table-ref ht key)
+ (let loop ((bucket (##sys#slot ht (hash-symbol key (##sys#size ht)))))
+ (and (not (eq? '() bucket))
+ (if (eq? key (##sys#slot (##sys#slot bucket 0) 0))
+ (##sys#slot (##sys#slot bucket 0) 1)
+ (loop (##sys#slot bucket 1))))))
+
+(define (my-hash-table-set! ht key val)
+ (let* ((k (hash-symbol key (##sys#size ht)))
+ (ib (##sys#slot ht k)))
+ (let loop ((bucket ib))
+ (if (eq? '() bucket)
+ (##sys#setslot ht k (cons (cons key val) ib))
+ (if (eq? key (##sys#slot (##sys#slot bucket 0) 0))
+ (##sys#setslot (##sys#slot bucket 0) 1 val)
+ (loop (##sys#slot bucket 1)))))))
+
+(define (symbolset-difference ls . lss)
+ (let ((seen (make-my-hash-table size)))
+ (for-each (lambda (lst)
+ (for-each (lambda (x) (my-hash-table-set! seen x #t)) lst))
+ lss)
+ (filter (lambda (x) (not (my-hash-table-ref seen x))) ls)))
+
+;;;;;;;;;;;;
+
+;; For comparison, a bit strange
+(define (srfi-69-hash-table-difference ht1 ht2)
+ (let ((result (hash-table-copy ht1)))
+ (hash-table-walk ht2 (lambda (k v)
+ (hash-table-delete! result k)))
+ result))
+
+;;;;;;;;;;;;
+
+(let lp ((i 10000))
+ (case impl
+ ((lset)
+ (lset-difference eq? lst (cons 'aha lst)))
+ ((slset)
+ (slset-difference lst (cons 'aha lst)))
+ ((symbolset)
+ (symbolset-difference lst (cons 'aha lst)))
+ ((srfi-69)
+ (when (pair? lst)
+ (let ((ht (make-hash-table eq? symbol-hash size)))
+ (begin
+ (for-each (lambda (x) (hash-table-set! ht x #t)) lst)
+ (set! lst ht))))
+ (srfi-69-hash-table-difference lst lst))
+ (else (error "Unknown impl" impl)))
+ (unless (zero? i)
+ (lp (sub1 i))))
diff --git a/slsets.egg b/slsets.egg
new file mode 100644
index 0000000..fd2b6f3
--- /dev/null
+++ b/slsets.egg
@@ -0,0 +1,8 @@
+;;; slsets.egg -*- Scheme -*-
+
+((synopsis "Lists of symbols as sets")
+ (author "Peter Bex")
+ (category data)
+ (license "BSD")
+ (test-dependencies test)
+ (components (extension slsets (csc-options "-O3"))))
diff --git a/slsets.scm b/slsets.scm
new file mode 100644
index 0000000..aef229f
--- /dev/null
+++ b/slsets.scm
@@ -0,0 +1,155 @@
+;;
+;; Lists of symbols as sets
+;;
+;; Essentially, this provides specialized versions of SRFI-1's lset
+;; operations for symbols but with linear instead of quadratic time
+;; complexity. It's even faster than using a side hash table, unless
+;; the symbols already have large plists associated with them.
+;;
+(module slsets
+ (slset<= slset= slset-adjoin ;; TODO: slset-delete and slset-contains?
+ slset-difference slset-intersection
+ slset-difference+intersection
+ slset-union slset-xor
+ slset-deduplicate)
+
+(import scheme (chicken base) (chicken foreign) (chicken plist))
+
+(define-inline (mark! x marking)
+ (when (symbol? x) (put! x marking #t)))
+
+(define-inline (mark-list! lst marking)
+ (for-each (lambda (x) (mark! x marking)) lst))
+
+(define-inline (unmark! x marking)
+ (when (symbol? x) (remprop! x marking)))
+
+(define-inline (unmark-list! lst marking)
+ (for-each (lambda (x) (unmark! x marking)) lst))
+
+(define-inline (with-marked-list lst fun)
+ (let ((marking (gensym 'm)))
+ (dynamic-wind
+ (lambda () (mark-list! lst marking))
+ (lambda () (fun marking))
+ (lambda () (unmark-list! lst marking)))))
+
+(define-inline (every? pred lst)
+ (let lp ((lst lst))
+ (cond ((null? lst) #t)
+ ((not (pred (car lst))) #f)
+ (else (lp (cdr lst))))))
+
+(define-inline (marked? x marking)
+ (and (symbol? x) (get x marking)))
+
+(define (last lst)
+ (if (null? (cdr lst))
+ (car lst)
+ (last (cdr lst))))
+
+(define-inline (filter pred lst)
+ (let lp ((lst lst)
+ (res '()))
+ (cond ((null? lst) (reverse res))
+ ((not (pred (car lst))) (lp (cdr lst) res))
+ (else (lp (cdr lst) (cons (car lst) res))))))
+
+(define (slset<= . lsts)
+ (or (null? lsts)
+ (let ((last-list (last lsts)))
+ (with-marked-list last-list
+ (lambda (m)
+ (every? (lambda (lst)
+ (or (eq? lst last-list) ; Avoid filtering out this one and building a new list
+ (every? (lambda (x) (marked? x m)) lst)))
+ lsts))))))
+
+(define (slset= . lsts)
+ (or (null? lsts)
+ (let ((lst (car lsts))
+ (lsts (cdr lsts)))
+ (every? (lambda (l)
+ (and (slset<= lst l) (slset<= l lst)))
+ lsts))))
+
+(define (slset-adjoin lst . els)
+ (with-marked-list lst
+ (lambda (m)
+ (let lp ((lst lst)
+ (els els))
+ (cond ((null? els) lst)
+ ((marked? (car els) m) (lp lst (cdr els)))
+ (else (mark! (car els) m)
+ (lp (cons (car els) lst) (cdr els))))))))
+
+(define (slset-difference lst . lsts)
+ (let lp ((lsts lsts)
+ (res lst))
+ (if (null? lsts)
+ res
+ (lp (cdr lsts)
+ (with-marked-list (car lsts) (lambda (m) (filter (lambda (x) (not (marked? x m))) res)))))))
+
+(define (slset-intersection lst . lsts)
+ (let lp ((lsts lsts)
+ (res lst))
+ (if (null? lsts)
+ res
+ (lp (cdr lsts)
+ (with-marked-list (car lsts)
+ (lambda (m) (filter (lambda (x) (marked? x m)) res)))))))
+
+;; TODO: Make this faster
+(define (slset-difference+intersection lst . lsts)
+ (values (apply slset-difference lst lsts)
+ (apply slset-intersection lst lsts)))
+
+;; NOTE: Can't use mark-list here, so roll our own!
+;; TODO: Clean up if there's an error cdring down any of these lists?
+(define (slset-union . lsts)
+ (if (null? lsts)
+ '()
+ (let ((marking (gensym 'm))
+ (first-list (car lsts)))
+ (mark-list! first-list marking)
+ (let lp ((lsts (cdr lsts))
+ (res first-list))
+ (if (null? lsts)
+ (begin (unmark-list! res marking)
+ res)
+ (let lp2 ((lst (car lsts))
+ (res res))
+ (if (null? lst)
+ (lp (cdr lsts) res)
+ (let ((x (car lst)))
+ (if (marked? x marking)
+ (lp2 (cdr lst) res)
+ (begin (mark! x marking)
+ (lp2 (cdr lst) (cons x res))))))))))))
+
+(define (slset-xor . lsts)
+ (let lp ((lsts lsts)
+ (res '()))
+ (if (null? lsts)
+ res
+ (lp (cdr lsts)
+ (append (slset-difference res (car lsts))
+ (slset-difference (car lsts) res))))))
+
+;; NOTE: Can't use mark-list here, so roll our own!
+;; TODO: Clean up if there's an error cdring down the list?
+(define (slset-deduplicate lst)
+ (let ((marking (gensym 'm)))
+ (let lp ((lst lst)
+ (res '()))
+ (if (null? lst)
+ (begin (unmark-list! res marking)
+ (reverse res))
+ (let ((x (car lst)))
+ (cond ((marked? x marking)
+ (lp (cdr lst) res))
+ (else (mark! x marking)
+ (lp (cdr lst) (cons x res)))))))))
+
+)
diff --git a/tests/run.scm b/tests/run.scm
new file mode 100644
index 0000000..06f4f2c
--- /dev/null
+++ b/tests/run.scm
@@ -0,0 +1,100 @@
+;; (load "../slsets.scm")
+(import slsets test)
+
+(test-group "slsets"
+ (test-group "slset<="
+ (test-group "examples from srfi-1 document"
+ (test-assert (slset<= '(a) '(a b a) '(a b c c)))
+ (test-assert "trivial case no args" (slset<=))
+ (test-assert "trivial case one arg" (slset<= '(a))))
+ (test-group "negatives"
+ (test-assert (not (slset<= '(a b a) '(a b c c) '(a))))
+ (test-assert (not (slset<= '(a b) '(a c))))))
+
+ (test-group "slset="
+ (test-group "examples from srfi-1 document"
+ (test-assert (slset= '(b e a) '(a e b) '(e e b a)))
+ (test-assert "trivial case no args" (slset=))
+ (test-assert "trivial case one arg" (slset= '(a))))
+ (test-group "negatives"
+ (test-assert (not (slset= '(b e a) '(a e b x) '(e e b a))))
+ (test-assert (not (slset= '(b e a x) '(a e b) '(e e b a))))
+ (test-assert (not (slset= '(b e a) '(a e b) '(e e b a x))))))
+
+ (test-group "slset-adjoin"
+ (test-group "example from srfi-1 document"
+ (test '(u o i a b c d c e)
+ (slset-adjoin '(a b c d c e) 'a 'e 'i 'o 'u)))
+ ;; Spec says it should, so check it
+ (test-group "result shares a common tail with the list argument"
+ (let* ((lst '(a b c d c e))
+ (result (slset-adjoin lst 'a 'x 'e 'b)))
+ (test-assert (eq? lst (cdr result))))))
+
+ (test-group "slset-union"
+ (test-group "examples from srfi-1 document"
+ (test '(u o i a b c d e) (slset-union '(a b c d e) '(a e i o u)))
+ (test "Repeated elements in LIST1 are preserved"
+ '(x a a c)
+ (slset-union '(a a c) '(x a x)))
+ (test "trivial case no args"
+ '()
+ (slset-union))
+ (test "trivial case one arg"
+ '(a b c)
+ (slset-union '(a b c)))))
+
+ (test-group "slset-xor"
+ (test-group "examples from srfi-1 document"
+ ;;(test '(d c b i o u) (slset-xor '(a b c d e) '(a e i o u)))
+ ;; NOTE: The above has a peculiar ordering, but there's nothing
+ ;; in the spec guaranteeing it should be like that. We return this:
+ (test '(b c d i o u) (slset-xor '(a b c d e) '(a e i o u)))
+ (test "trivial case no args"
+ '() (slset-xor))
+ (test "trivial case one arg"
+ '(a b c d e) (slset-xor '(a b c d e))))
+ (test-group "extra examples"
+ (test "for multiple lists, returns elements that appear in an odd number of lists"
+ '(c d i u e x) (slset-xor '(a b c d e) '(a e i o u) '(b o e) '(x)))
+ (test "duplicates in lists are preserved"
+ '(b b c d d i o u o) (slset-xor '(a b b c d e d) '(a e i o u o)))))
+
+ (test-group "slset-difference"
+ (test-group "examples from srfi-1 document"
+ (test '(b c d) (slset-difference '(a b c d e) '(a e i o u)))
+ (test "trivial case"
+ '(a b c) (slset-difference '(a b c))))
+ (test-group "more examples"
+ (test "duplicates in first list are preserved"
+ '(b b c d)
+ (slset-difference '(a b b c x x d x i e) '(a e i o u) '() '(x x y z z)))
+ (test "empty list returns same list"
+ '(a b b c d e)
+ (slset-difference '(a b b c d e) '()))
+ (test "empty list is empty"
+ '()
+ (slset-difference '() '(a b b c d e)))
+ (test "empty list is still empty after other args"
+ '()
+ (slset-difference '() '(a b) '(c d e) '()))))
+
+ (test-group "slset-intersection"
+ (test-group "examples from srfi-1 document"
+ (test '(a e) (slset-intersection '(a b c d e) '(a e i o u)))
+ (test "repeated elements in LIST1 are preserved"
+ '(a x a) (slset-intersection '(a x y a) '(x a x z)))
+ (test "trivial case"
+ '(a b c) (slset-intersection '(a b c))))
+ (test-group "more examples"
+ (test "multiple lists"
+ '(a a) (slset-intersection '(a x b y b a) '(x a x z) '(a b)))))
+
+ (test-group "slset-deduplicate (as per srfi-1's delete-duplicates)"
+ (test-group "example from srfi-1 document"
+ (test '(a b c z)
+ (slset-deduplicate '(a b a c a b c z))))
+ (test-group "more examples"
+ (test '() (slset-deduplicate '())))))
+
+(test-exit)