csv_export.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420
  1. package costmodel
  2. import (
  3. "bytes"
  4. "context"
  5. "encoding/csv"
  6. "io"
  7. "sort"
  8. "strconv"
  9. "time"
  10. "github.com/pkg/errors"
  11. "github.com/opencost/opencost/pkg/kubecost"
  12. "github.com/opencost/opencost/pkg/log"
  13. )
  // CloudStorage is the storage backend used by the CSV exporter.
  // Objects are addressed by name and exchanged as whole byte slices.
  // In this file the exporter checks whether the export file exists, reads the
  // previous export, and writes the merged result back under the same name.
  type CloudStorage interface {
  	// Write stores data under name.
  	// NOTE(review): the exporter writes the merged file back to the same name,
  	// so it presumably expects an existing object to be replaced — confirm with implementations.
  	Write(name string, data []byte) error
  	// Read returns the content stored under name.
  	Read(name string) ([]byte, error)
  	// Exists reports whether an object called name is present.
  	Exists(name string) (bool, error)
  }
  // AllocationModel is the source of allocation data for the CSV exporter.
  type AllocationModel interface {
  	// ComputeAllocation returns the allocations for the window [start, end).
  	// The exporter calls it once per UTC day with a 5-minute resolution.
  	ComputeAllocation(start, end time.Time, resolution time.Duration) (*kubecost.AllocationSet, error)
  	// DateRange returns the start and end of the time span the exporter derives
  	// its exportable days from.
  	// NOTE(review): presumably the span covered by the underlying metrics store — confirm.
  	DateRange() (time.Time, time.Time, error)
  }
  // errNoData signals that there is nothing to export: availableAllocationDates
  // returns it when the date range contains no day to export, and
  // writeCSVToWriter returns it when no allocation row was written.
  var errNoData = errors.New("no data")
  24. // UpdateCSVWorker launches a worker that updates CSV file in cloud storage with allocation data
  25. // It updates data immediately on launch and then runs every day at 00:10 UTC
  26. // It expected to run a goroutine
  27. func UpdateCSVWorker(ctx context.Context, storage CloudStorage, model AllocationModel, path string) error {
  28. // perform first update immediately
  29. nextRunAt := time.Now()
  30. for {
  31. select {
  32. case <-ctx.Done():
  33. return ctx.Err()
  34. case <-time.After(nextRunAt.Sub(time.Now())):
  35. log.Infof("Updating CSV file: %s", path)
  36. err := UpdateCSV(ctx, storage, model, path)
  37. if err != nil {
  38. // it's background worker, log error and carry on, maybe next time it will work
  39. log.Errorf("Error updating CSV: %s", err)
  40. }
  41. now := time.Now().UTC()
  42. // next launch is at 00:10 UTC tomorrow
  43. // extra 10 minutes is to let prometheus to collect all the data for the previous day
  44. nextRunAt = time.Date(now.Year(), now.Month(), now.Day(), 0, 10, 0, 0, now.Location()).AddDate(0, 0, 1)
  45. }
  46. }
  47. }
  48. func UpdateCSV(ctx context.Context, storage CloudStorage, model AllocationModel, path string) error {
  49. exporter := &csvExporter{
  50. Storage: storage,
  51. Model: model,
  52. FilePath: path,
  53. }
  54. return exporter.Update(ctx)
  55. }
  // csvExporter bundles everything needed for one export run.
  type csvExporter struct {
  	// Storage is where the export file is checked for, read from and written to.
  	Storage CloudStorage
  	// Model supplies the available date range and the per-day allocations.
  	Model AllocationModel
  	// FilePath is the name of the export file inside Storage.
  	FilePath string
  }
  61. // Update updates CSV file in cloud storage with new allocation data
  62. // TODO: currently everything is processed in memory, it might be a problem for large clusters
  63. // it's possible to switch to temporary files, but it will require upgrading all storage provider to work with files
  64. func (e *csvExporter) Update(ctx context.Context) error {
  65. allocationDates, err := e.availableAllocationDates()
  66. if err != nil {
  67. return err
  68. }
  69. if len(allocationDates) == 0 {
  70. return errors.New("no data to export from prometheus")
  71. }
  72. exist, err := e.Storage.Exists(e.FilePath)
  73. if err != nil {
  74. return err
  75. }
  76. var result []byte
  77. // cloud storage doesn't have an existing file
  78. // dump all the data exist to the file
  79. if !exist {
  80. result, err = e.allocationsToCSV(ctx, mapTimeToSlice(allocationDates))
  81. if err != nil {
  82. return err
  83. }
  84. }
  85. // existing export file exists
  86. // scan through it and ignore all dates that are already in the file
  87. // avoid modifying existing data or producing duplicates
  88. if exist {
  89. previousExport, err := e.Storage.Read(e.FilePath)
  90. if err != nil {
  91. return err
  92. }
  93. csvDates, err := e.loadDates(previousExport)
  94. if err != nil {
  95. return err
  96. }
  97. for date := range csvDates {
  98. delete(allocationDates, date)
  99. }
  100. if len(allocationDates) == 0 {
  101. log.Info("export file in cloud storage already contain data for all dates, skipping update")
  102. return nil
  103. }
  104. dateExport, err := e.allocationsToCSV(ctx, mapTimeToSlice(allocationDates))
  105. if err != nil {
  106. return err
  107. }
  108. result, err = mergeCSV([][]byte{previousExport, dateExport})
  109. if err != nil {
  110. return err
  111. }
  112. }
  113. err = e.Storage.Write(e.FilePath, result)
  114. if err != nil {
  115. return err
  116. }
  117. log.Infof("Updated CSV file %s", e.FilePath)
  118. return nil
  119. }
  120. func (e *csvExporter) availableAllocationDates() (map[time.Time]struct{}, error) {
  121. start, end, err := e.Model.DateRange()
  122. if err != nil {
  123. return nil, err
  124. }
  125. if start != time.Date(start.Year(), start.Month(), start.Day(), 0, 0, 0, 0, time.UTC) {
  126. // start doesn't start from 00:00 UTC, it could be truncated by prometheus retention policy
  127. // skip incomplete data and begin from the day after, otherwise it may corrupt existing data
  128. start = time.Date(start.Year(), start.Month(), start.Day(), 0, 0, 0, 0, time.UTC).AddDate(0, 0, 1)
  129. }
  130. end = time.Date(end.Year(), end.Month(), end.Day(), 0, 0, 0, 0, time.UTC)
  131. dates := make(map[time.Time]struct{})
  132. for date := start; date.Before(end); date = date.AddDate(0, 0, 1) {
  133. dates[date] = struct{}{}
  134. }
  135. if len(dates) == 0 {
  136. return nil, errNoData
  137. }
  138. return dates, nil
  139. }
  140. func (e *csvExporter) allocationsToCSV(ctx context.Context, dates []time.Time) ([]byte, error) {
  141. buf := new(bytes.Buffer)
  142. err := e.writeCSVToWriter(ctx, buf, dates)
  143. if err != nil {
  144. return nil, err
  145. }
  146. return buf.Bytes(), nil
  147. }
  148. func (e *csvExporter) writeCSVToWriter(ctx context.Context, w io.Writer, dates []time.Time) error {
  149. fmtFloat := func(f float64) string {
  150. return strconv.FormatFloat(f, 'f', -1, 64)
  151. }
  152. csvWriter := csv.NewWriter(w)
  153. // TODO: confirm columns we want to export
  154. err := csvWriter.Write([]string{
  155. "Date",
  156. "Namespace",
  157. "ControllerKind",
  158. "ControllerName",
  159. "Pod",
  160. "Container",
  161. "CPUCoreUsageAverage",
  162. "CPUCoreRequestAverage",
  163. "RAMBytesUsageAverage",
  164. "RAMBytesRequestAverage",
  165. "NetworkReceiveBytes",
  166. "NetworkTransferBytes",
  167. "GPUs",
  168. "PVBytes",
  169. "CPUCost",
  170. "RAMCost",
  171. "NetworkCost",
  172. "PVCost",
  173. "GPUCost",
  174. "TotalCost",
  175. })
  176. if err != nil {
  177. return err
  178. }
  179. lines := 0
  180. for _, date := range dates {
  181. start := time.Date(date.Year(), date.Month(), date.Day(), 0, 0, 0, 0, time.UTC)
  182. end := start.AddDate(0, 0, 1)
  183. data, err := e.Model.ComputeAllocation(start, end, 5*time.Minute)
  184. if err != nil {
  185. return err
  186. }
  187. log.Infof("fetched %d records for %s", len(data.Allocations), date.Format("2006-01-02"))
  188. // TODO: does it need to be aggregated by namespace+controller?
  189. // container-level information can be too noisy for most users
  190. for _, alloc := range data.Allocations {
  191. if err := ctx.Err(); err != nil {
  192. return err
  193. }
  194. log.Infof("%f", alloc.TotalCost())
  195. err := csvWriter.Write([]string{
  196. date.Format("2006-01-02"),
  197. alloc.Properties.Namespace,
  198. alloc.Properties.ControllerKind,
  199. alloc.Properties.Controller,
  200. alloc.Properties.Pod,
  201. alloc.Properties.Container,
  202. fmtFloat(alloc.CPUCoreUsageAverage),
  203. fmtFloat(alloc.CPUCoreRequestAverage),
  204. fmtFloat(alloc.RAMBytesUsageAverage),
  205. fmtFloat(alloc.RAMBytesRequestAverage),
  206. fmtFloat(alloc.NetworkReceiveBytes),
  207. fmtFloat(alloc.NetworkTransferBytes),
  208. fmtFloat(alloc.GPUs()),
  209. fmtFloat(alloc.PVBytes()),
  210. fmtFloat(alloc.CPUTotalCost()),
  211. fmtFloat(alloc.RAMTotalCost()),
  212. fmtFloat(alloc.NetworkTotalCost()),
  213. fmtFloat(alloc.PVCost()),
  214. fmtFloat(alloc.GPUCost),
  215. fmtFloat(alloc.TotalCost()),
  216. })
  217. if err != nil {
  218. return err
  219. }
  220. lines++
  221. }
  222. }
  223. if lines == 0 {
  224. return errNoData
  225. }
  226. csvWriter.Flush()
  227. if err := csvWriter.Error(); err != nil {
  228. return err
  229. }
  230. log.Infof("exported %d lines", lines)
  231. return nil
  232. }
  233. // loadDate scans through CSV export file and extract all dates from "Date" column
  234. func (e *csvExporter) loadDates(csvFile []byte) (map[time.Time]struct{}, error) {
  235. csvReader := csv.NewReader(bytes.NewReader(csvFile))
  236. header, err := csvReader.Read()
  237. if err != nil {
  238. return nil, errors.Wrap(err, "reading csv header")
  239. }
  240. dateColIndex := 0
  241. for i, col := range header {
  242. if col == "Date" {
  243. dateColIndex = i
  244. break
  245. }
  246. }
  247. dates := make(map[time.Time]struct{})
  248. for {
  249. row, err := csvReader.Read()
  250. if errors.Is(err, io.EOF) {
  251. break
  252. }
  253. if err != nil {
  254. return nil, errors.Wrap(err, "reading csv row")
  255. }
  256. date, err := time.Parse("2006-01-02", row[dateColIndex])
  257. if err != nil {
  258. return nil, errors.Wrap(err, "parsing date")
  259. }
  260. dates[date] = struct{}{}
  261. }
  262. return dates, nil
  263. }
  264. // mergeCSV merges multiple csv files into one.
  265. // Files may have different headers, but the result will have a header that is a union of all headers.
  266. // The main goal here is to allow changing CSV format without breaking or loosing existing data.
  267. func mergeCSV(files [][]byte) ([]byte, error) {
  268. var err error
  269. headers := make([][]string, 0, len(files))
  270. csvReaders := make([]*csv.Reader, 0, len(files))
  271. // first, get information about the result header
  272. for _, file := range files {
  273. csvReader := csv.NewReader(bytes.NewReader(file))
  274. header, err := csvReader.Read()
  275. if errors.Is(err, io.EOF) {
  276. // ignore empty files
  277. continue
  278. }
  279. if err != nil {
  280. return nil, errors.Wrap(err, "reading header of csv file")
  281. }
  282. headers = append(headers, header)
  283. csvReaders = append(csvReaders, csvReader)
  284. }
  285. mapping, header := combineHeaders(headers)
  286. output := new(bytes.Buffer)
  287. csvWriter := csv.NewWriter(output)
  288. err = csvWriter.Write(mergeHeaders(headers))
  289. if err != nil {
  290. return nil, errors.Wrap(err, "writing header to csv file")
  291. }
  292. for csvIndex, csvReader := range csvReaders {
  293. for {
  294. inputLine, err := csvReader.Read()
  295. if errors.Is(err, io.EOF) {
  296. break
  297. }
  298. if err != nil {
  299. return nil, errors.Wrap(err, "reading csv file line")
  300. }
  301. outputLine := make([]string, len(header))
  302. for colIndex := range header {
  303. destColIndex, ok := mapping[csvIndex][colIndex]
  304. if !ok {
  305. continue
  306. }
  307. outputLine[destColIndex] = inputLine[colIndex]
  308. }
  309. err = csvWriter.Write(outputLine)
  310. if err != nil {
  311. return nil, errors.Wrap(err, "writing line to csv file")
  312. }
  313. }
  314. }
  315. csvWriter.Flush()
  316. if csvWriter.Error() != nil {
  317. return nil, errors.Wrapf(csvWriter.Error(), "flushing csv file")
  318. }
  319. return output.Bytes(), nil
  320. }
  // combineHeaders builds the union of the given headers and a per-header column mapping.
  // The second result lists every distinct column name in order of first appearance.
  // The first result has one map per input header; it maps a column index in that
  // header to the index of the same column name in the combined header.
  func combineHeaders(headers [][]string) ([]map[int]int, []string) {
  	result := make([]string, 0)
  	indices := make([]map[int]int, len(headers))
  	// position remembers where each column name landed in result, so a column is
  	// resolved with one lookup instead of rescanning result
  	position := make(map[string]int)

  	for i, header := range headers {
  		indices[i] = make(map[int]int, len(header))
  		for j, column := range header {
  			dest, seen := position[column]
  			if !seen {
  				dest = len(result)
  				position[column] = dest
  				result = append(result, column)
  			}
  			indices[i][j] = dest
  		}
  	}
  	return indices, result
  }
  // mergeHeaders returns the union of all column names in the given headers,
  // without duplicates and in order of first appearance.
  func mergeHeaders(headers [][]string) []string {
  	result := make([]string, 0)
  	// seen makes the duplicate check a single lookup instead of a scan of result
  	seen := make(map[string]struct{})
  	for _, header := range headers {
  		for _, column := range header {
  			if _, dup := seen[column]; dup {
  				continue
  			}
  			seen[column] = struct{}{}
  			result = append(result, column)
  		}
  	}
  	return result
  }
  // contains reports whether item occurs in slice.
  func contains(slice []string, item string) bool {
  	for i := 0; i < len(slice); i++ {
  		if slice[i] == item {
  			return true
  		}
  	}
  	return false
  }
  // indexOf returns the position of the first occurrence of element in slice,
  // or -1 when element is not present.
  func indexOf(slice []string, element string) int {
  	for pos := 0; pos < len(slice); pos++ {
  		if slice[pos] == element {
  			return pos
  		}
  	}
  	return -1
  }
  // mapTimeToSlice returns the keys of data as a slice sorted from earliest to latest.
  func mapTimeToSlice(data map[time.Time]struct{}) []time.Time {
  	ordered := make([]time.Time, len(data))
  	next := 0
  	for moment := range data {
  		ordered[next] = moment
  		next++
  	}
  	sort.Slice(ordered, func(a, b int) bool {
  		return ordered[b].After(ordered[a])
  	})
  	return ordered
  }
  373. }