Have you ever typed
$ sudo chmod -R 400 *
and then wondered which directory you were in? It happens to the best of us.
I figured this would be a good opportunity to try DuckDB.
First, I asked Claude 3.5 Sonnet to write a Go program that traverses a path, takes note of each file or directory’s permissions in octal format, and writes the results to Parquet files in chunks of 10,000 records. Claude managed this in “one shot”, with no adjustments needed.
package main
import (
"fmt"
"log"
"os"
"path/filepath"
"github.com/apache/arrow/go/v15/arrow"
"github.com/apache/arrow/go/v15/arrow/array"
"github.com/apache/arrow/go/v15/arrow/memory"
"github.com/apache/arrow/go/v15/parquet"
"github.com/apache/arrow/go/v15/parquet/compress"
"github.com/apache/arrow/go/v15/parquet/pqarrow"
)
// FileRecord describes one filesystem entry visited during the walk. Each
// value becomes a single row in the Parquet output.
type FileRecord struct {
	// Root is the root directory the walk started from; RelativePath is the
	// entry's path relative to that root.
	Root, RelativePath string

	// OctalPermission holds the entry's permission bits formatted as
	// zero-padded octal text (for example "755"); Type is one of
	// "directory", "file" or "other".
	OctalPermission, Type string
}
// main walks the directory tree rooted at the first command-line argument and
// records every entry's path (relative to that root), permission bits and
// type. Records are buffered in memory and flushed to numbered Parquet files
// named "<prefix>_<n>.parquet", where the prefix is the second argument.
//
// Any error encountered during the walk or while writing a chunk aborts the
// program with a non-zero exit status.
func main() {
	// chunkSize is the number of records buffered before a Parquet file is
	// written; it also sizes the buffer so it never has to grow.
	const chunkSize = 10000

	if len(os.Args) != 3 {
		// Usage errors go to stderr so they are not mistaken for program output.
		fmt.Fprintln(os.Stderr, "Usage: program <root_directory> <output_parquet_file_prefix>")
		os.Exit(1)
	}
	rootDir := os.Args[1]
	outputPrefix := os.Args[2]

	records := make([]FileRecord, 0, chunkSize)
	fileCounter := 0

	err := filepath.WalkDir(rootDir, func(path string, d os.DirEntry, err error) error {
		if err != nil {
			return err
		}
		fi, err := d.Info()
		if err != nil {
			return fmt.Errorf("reading file info for %q: %w", path, err)
		}
		mode := fi.Mode()

		relPath, err := filepath.Rel(rootDir, path)
		if err != nil {
			return fmt.Errorf("computing path of %q relative to %q: %w", path, rootDir, err)
		}

		var fileType string
		switch {
		case mode.IsDir():
			fileType = "directory"
		case mode.IsRegular():
			fileType = "file"
		default:
			fileType = "other"
		}

		records = append(records, FileRecord{
			Root:         rootDir,
			RelativePath: relPath,
			// Only the permission bits are recorded, as zero-padded octal.
			OctalPermission: fmt.Sprintf("%03o", mode.Perm()),
			Type:            fileType,
		})
		if len(records) < chunkSize {
			return nil
		}

		// The buffer is full: flush it and start the next chunk.
		if err := writeParquetFile(outputPrefix, records, fileCounter); err != nil {
			return fmt.Errorf("writing parquet chunk %d: %w", fileCounter, err)
		}
		// Reuse the backing array; the records have already been written out.
		records = records[:0]
		fileCounter++
		return nil
	})
	if err != nil {
		log.Fatalf("Error walking the path %q: %v\n", rootDir, err)
	}

	// Write any remaining records
	if len(records) > 0 {
		if err := writeParquetFile(outputPrefix, records, fileCounter); err != nil {
			log.Fatalf("Error writing parquet file: %v\n", err)
		}
	}
}
// writeParquetFile writes records to a single Snappy-compressed Parquet file
// named "<outputPrefix>_<counter>.parquet". The file has four string columns:
// root, relative_path, octal_permission and type, one row per record.
//
// It returns the first error encountered while creating the file, creating
// the Parquet writer, writing the batch, or closing the writer.
func writeParquetFile(outputPrefix string, records []FileRecord, counter int) error {
	out, err := os.Create(fmt.Sprintf("%s_%d.parquet", outputPrefix, counter))
	if err != nil {
		return err
	}
	defer out.Close()

	columns := []arrow.Field{
		{Name: "root", Type: arrow.BinaryTypes.String},
		{Name: "relative_path", Type: arrow.BinaryTypes.String},
		{Name: "octal_permission", Type: arrow.BinaryTypes.String},
		{Name: "type", Type: arrow.BinaryTypes.String},
	}
	schema := arrow.NewSchema(columns, nil)

	builder := array.NewRecordBuilder(memory.NewGoAllocator(), schema)
	defer builder.Release()

	// One typed column builder per schema field, in schema order.
	rootCol := builder.Field(0).(*array.StringBuilder)
	pathCol := builder.Field(1).(*array.StringBuilder)
	permCol := builder.Field(2).(*array.StringBuilder)
	typeCol := builder.Field(3).(*array.StringBuilder)
	for i := range records {
		rec := &records[i]
		rootCol.Append(rec.Root)
		pathCol.Append(rec.RelativePath)
		permCol.Append(rec.OctalPermission)
		typeCol.Append(rec.Type)
	}

	batch := builder.NewRecord()
	defer batch.Release()

	pw, err := pqarrow.NewFileWriter(
		schema,
		out,
		parquet.NewWriterProperties(parquet.WithCompression(compress.Codecs.Snappy)),
		pqarrow.NewArrowWriterProperties(pqarrow.WithStoreSchema()),
	)
	if err != nil {
		return err
	}
	if err = pw.Write(batch); err != nil {
		return err
	}
	return pw.Close()
}
I ran the Go program on two rsnapshot backups to find out which files had been mangled. This produced a few hundred Parquet files:
$ ls laptop-level0*parquet | wc -l
262
$ ls laptop-level1*parquet | wc -l
218
Next, a quick DuckDB script to read all the files, join on filename, strip a path prefix, and write to an output Parquet file:
-- Load all Parquet chunks matching each glob into one table per snapshot.
create or replace table dir1 as select * from parquet_scan('laptop-level0*parquet');
create or replace table dir2 as select * from parquet_scan('laptop-level1*parquet');
-- Write the entries whose permissions differ between the two tables to todo.parquet.
COPY (
-- Keep dir1's permission and strip the 'laptop/home/carlo/' prefix from the path.
SELECT d1.octal_permission, SUBSTR(d1.relative_path, LENGTH('laptop/home/carlo/') + 1) as filepath
FROM dir1 d1
-- Inner join: only paths present in both tables are compared.
JOIN dir2 d2 ON d1.relative_path = d2.relative_path
WHERE d1.octal_permission != d2.octal_permission
-- Restrict the result to entries under laptop/home/carlo/.
AND d1.relative_path LIKE 'laptop/home/carlo/%'
) TO 'todo.parquet';
This is surprisingly quick. Using Parquet files avoids the issues of escaping filenames and dealing with spaces. CSV, never again.
We find over 42,000 files to be changed:
D SELECT COUNT(*) FROM 'todo.parquet';
┌──────────────┐
│ count_star() │
│ int64 │
├──────────────┤
│ 42196 │
└──────────────┘
And it’s so quick to poke around:
D FROM 'todo.parquet' WHERE filepath LIKE '%.vscode/extensions%';
┌──────────────────┬─────────────────────────────────────────────────────────────────────────────────┐
│ octal_permission │ filepath │
│ varchar │ varchar │
├──────────────────┼─────────────────────────────────────────────────────────────────────────────────┤
│ 755 │ .vscode/extensions/valentjn.vscode-ltex-13.1.0/tmp-4sPF5t │
│ 755 │ .vscode/extensions │
│ 755 │ .vscode/extensions/acharluk.easy-cpp-projects-2.0.0 │
│ 755 │ .vscode/extensions/acharluk.easy-cpp-projects-2.0.0/node_modules │
│ 755 │ .vscode/extensions/acharluk.easy-cpp-projects-2.0.0/node_modules/node-fetch │
│ 755 │ .vscode/extensions/acharluk.easy-cpp-projects-2.0.0/node_modules/node-fetch/lib │
│ 755 │ .vscode/extensions/acharluk.easy-cpp-projects-2.0.0/out │
│ 755 │ .vscode/extensions/easycpp_custom_templates │
└──────────────────┴─────────────────────────────────────────────────────────────────────────────────┘
A one-shot session with Claude provided a simple Go program to fix all the permissions, using todo.parquet
as input. Then, to tidy up, I also ran:
$ find . -type f -exec file {} \; | grep -E 'shell script|executable' | cut -d ':' -f 1 | xargs chmod +x
to be conservative and make sure all shell scripts and binaries were executable.
I heard about DuckDB on this episode of Developer Voices.
The guests wrote this book; I’m enjoying it: