Skip to content

Commit

Permalink
Respect files without styles
Browse files Browse the repository at this point in the history
fix
  • Loading branch information
velios committed Nov 3, 2023
1 parent 11342a8 commit 424788b
Show file tree
Hide file tree
Showing 6 changed files with 69 additions and 56 deletions.
10 changes: 8 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
# Changelog

0.0.8 / 2023-11-03
------------------
- Changed
- Fix parsing of xlsx files without xl/styles.xml
- Refactoring

0.0.7 / 2023-10-27
------------------
- Changed
- Fix parsing of xlsx files without xl/sharedStrings.xml
- Fix parsing of cells with number and inlineStr data types
- Fix parsing of xlsx files without xl/sharedStrings.xml
- Fix parsing of cells with number and inlineStr data types

0.0.6 / 2023-09-16
------------------
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ In the first example below we pull data from a specific sheet within the workboo
#!/usr/bin/env bb
(require '[babashka.deps :as deps])
(deps/add-deps
'{:deps {com.github.kbosompem/bb-excel {:mvn/version "0.0.6"}}})
'{:deps {com.github.kbosompem/bb-excel {:mvn/version "0.0.8"}}})

(ns demo
(:require [clojure.java.io :as io]
Expand Down
4 changes: 2 additions & 2 deletions bb-excel
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

(require '[babashka.deps :as deps])

(deps/add-deps '{:deps {com.github.kbosompem/bb-excel {:mvn/version "0.0.6"}}})
(deps/add-deps '{:deps {com.github.kbosompem/bb-excel {:mvn/version "0.0.8"}}})

(ns bb-excel
(:require [bb-excel.core :refer [get-sheets get-range get-sheet]]
Expand Down Expand Up @@ -115,7 +115,7 @@
(defn help
"Command line options"
[summary]
(->> ["bb-excel 0.0.6"
(->> ["bb-excel 0.0.8"
""
"Usage: bb-excel input-file options"
""
Expand Down
4 changes: 2 additions & 2 deletions bbexcel
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

(require '[babashka.deps :as deps])

(deps/add-deps '{:deps {com.github.kbosompem/bb-excel {:mvn/version "0.0.6"}}})
(deps/add-deps '{:deps {com.github.kbosompem/bb-excel {:mvn/version "0.0.8"}}})

(ns bbexcel
(:require [bb-excel.core :refer [get-sheets get-range get-sheet]]
Expand Down Expand Up @@ -115,7 +115,7 @@
(defn help
"Command line options"
[summary]
(->> ["bbexcel 0.0.6"
(->> ["bbexcel 0.0.8"
""
"Usage: bbexcel input-file options"
""
Expand Down
2 changes: 1 addition & 1 deletion project.clj
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
(defproject com.github.kbosompem/bb-excel "0.0.7"
(defproject com.github.kbosompem/bb-excel "0.0.8"
:description "A Simple Clojure/Babashka Library for Reading Data from Excel Files"
:url "https://github.com/kbosompem/bb-excel"
:license {:name "EPL-2.0"
Expand Down
103 changes: 55 additions & 48 deletions src/bb_excel/core.clj
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
(ns bb-excel.core
(:require [clojure.data.xml :refer [parse-str]]
(:require [clojure.string :as str]
[clojure.data.xml :refer [parse-str]]
[clojure.java.io :as io]
[clojure.set :refer [rename-keys]])
(:import [java.io File]
Expand Down Expand Up @@ -132,68 +133,74 @@

(defn process-cell
  "Normalise one raw cell map.

  Arguments:
    dict   - shared-strings lookup, keyed by integer index.
    styles - style collection, handed unchanged to `style-check`.
    cell   - map of the cell's XML attributes (`:r` reference such as
             \"AB12\", optional `:t` type) plus its raw value under `:d`.

  Returns the cell with `:x` (column letters, e.g. \"AB\") and `:y` (row
  digits, still a string, e.g. \"12\") added, and with `:d` converted
  according to `:t` or, when no type matches, according to the cell's style.
  A missing `:r` or `:d` yields nil for the derived value instead of throwing."
  [dict styles cell]
  (let [;; "AB12" -> ["AB12" "AB" "12"]: letters are the column, digits the row.
        ;; `some->>` keeps a cell without an :r attribute from blowing up in
        ;; re-matches.
        [_ col-letters row-digits] (some->> (:r cell)
                                            (re-matches #"([A-Z]+)([0-9]+)"))
        cell* (merge cell
                     {:x col-letters
                      :y row-digits})
        cell-type (:t cell*)
        cell-value (:d cell*)]
    (cond
      ;; Possible data types well explained here https://stackoverflow.com/a/18346273
      ;; Shared string: the value is an index into `dict`.
      (= cell-type "s") (assoc cell* :d (some->> cell-value parse-long (get dict)))
      (= cell-type "str") cell*
      (= cell-type "inlineStr") cell*
      (= cell-type "b") (assoc cell* :d (= "1" cell-value))
      (= cell-type "e") (assoc cell* :d (get error-codes cell-value))
      ;; parse-long returns nil for "3.14", so fall back to parse-double
      ;; rather than silently dropping non-integer numbers.
      (= cell-type "n") (assoc cell* :d (when cell-value
                                          (or (parse-long cell-value)
                                              (parse-double cell-value))))
      (style-check cell* styles pcts) (assoc cell* :d (num2pct cell-value))
      (style-check cell* styles dates) (assoc cell* :d (num2date cell-value))
      (style-check cell* styles times) (assoc cell* :d (num2time cell-value))
      :else cell*)))

(defn- get-row-index
  "Return the numeric row index stored as a string under `:y` of `row`.

  Returns nil when `row` is nil or has no `:y`. A bare `parse-long` would
  throw on nil, which is what an empty row produces when the caller passes
  the first of zero cells."
  [row]
  (some-> row :y parse-long))

(defn process-row
  "Process one Excel row.

  Arguments:
    dict   - shared-strings lookup, passed through to `process-cell`.
    styles - style collection, passed through to `process-cell`.
    row    - sequence of parsed cell elements, each with `:attrs` and
             `:content`.

  Returns a map with `:_r` (row number, taken from the first cell) plus one
  entry per cell keyed by the keywordised column letters. A row without cells
  yields `{}` instead of throwing."
  [dict styles row]
  (let [cells (->> row
                   ;; The raw value is the last child of the cell's last child.
                   (map (fn [cell]
                          (merge (:attrs cell)
                                 {:d (-> cell :content last :content last)})))
                   (map (partial process-cell dict styles)))]
    ;; Guard the empty row: there is no first cell to read a row index from.
    (if-let [head (first cells)]
      (into {:_r (get-row-index head)}
            (map (fn [cell] [(keyword (:x cell)) (:d cell)]))
            cells)
      {})))

(defn- get-cell-text
  "Concatenate the content of every node under `cell` (inclusive) whose tag
  is a member of `(:text-t tags)`, in document order, into one string."
  [cell]
  (let [text-tags (:text-t tags)
        text-node? (fn [node] (contains? text-tags (:tag node)))]
    ;; `str` works as the reducing fn: "" to start, append at each step.
    (transduce (comp (filter text-node?)
                     (mapcat :content))
               str
               (xml-seq cell))))

(defn get-unique-strings
  "Build the shared-strings dictionary of the workbook in `zipfile`.

  Returns a map from zero-based position to the text of each matching
  element of xl/sharedStrings.xml, or `{}` when the archive has no such
  entry."
  [^ZipFile zipfile]
  (let [entry (.getEntry zipfile "xl/sharedStrings.xml")]
    (if (nil? entry)
      {}
      (let [root (-> (.getInputStream zipfile entry) slurp parse-str)
            text-part? (fn [node] ((:text-part tags) (:tag node)))]
        (into {}
              (comp (filter text-part?)
                    (map get-cell-text)
                    ;; Pair every string with its position: [0 s0] [1 s1] ...
                    (map-indexed vector))
              (xml-seq root))))))

(defn get-styles
  "Read the cell formats of the workbook in `zipfile`.

  Returns a vector with the `:numFmtId` attribute of every `xf` child of the
  first `cellXfs`-tagged element in xl/styles.xml, or `[]` when the archive
  has no such entry."
  [^ZipFile zipfile]
  (let [entry (.getEntry zipfile "xl/styles.xml")]
    (if (nil? entry)
      []
      (let [root (-> (.getInputStream zipfile entry) slurp parse-str)
            cell-xfs? (fn [node] ((:cellxfs tags) (:tag node)))
            xf? (fn [node] ((:xf tags) (:tag node)))
            cell-xfs (first (filter cell-xfs? (xml-seq root)))]
        (into []
              (comp (filter xf?)
                    (map (comp :numFmtId :attrs)))
              (:content cell-xfs))))))

(defn get-sheet
"Get sheet from file or filename"
Expand Down Expand Up @@ -226,7 +233,7 @@
styles (get-styles zipfile)
xx (slurp ins)
x (parse-str xx)
d (->> x :content
d (->> (:content x)
(filter #((:sheet-data tags) (:tag %)))
first :content
(map :content)
Expand Down Expand Up @@ -274,7 +281,7 @@
(defn parse-range
"Takes in an Excel coordinate and returns a hashmap of rows and columns to pull"
[s]
(let [[[_ osc osr oec oer]] (re-seq #"([A-Z]+)([0-9]*)[:]?([A-Z]*)([0-9]*)" s)
(let [[_ osc osr oec oer] (re-matches #"([A-Z]+)([0-9]*)[:]?([A-Z]*)([0-9]*)" s)
sc (or osc "A")
ec (or (when-str oec) (when-str osc) sc)
sr (or (when-num osr) 1)
Expand Down

0 comments on commit 424788b

Please sign in to comment.