2
0
sdess09 2 жил өмнө
parent
commit
575b3b49ba

+ 1 - 0
dashboard/src/components/GPUProvisionSettings.tsx

@@ -26,6 +26,7 @@ const gpuMachineTypeOptions = [
 
   { value: "g4dn.xlarge", label: "g4dn.xlarge" },
   { value: "g4dn.2xlarge", label: "g4dn.2xlarge" },
+  { value: "p4d.24xlarge", label: "p4d.24xlarge" },
 ];
 
 

+ 32 - 4
dashboard/src/lib/hooks/useClusterResourceLimits.ts

@@ -13,6 +13,7 @@ import { z } from "zod";
 import {
   AWS_INSTANCE_LIMITS,
   AZURE_INSTANCE_LIMITS,
+  GPU_INSTANCE_LIMIT,
 } from "main/home/app-dashboard/validate-apply/services-settings/tabs/utils";
 
 import api from "shared/api";
@@ -99,6 +100,7 @@ const clusterNodesValidator = z
         AWS_INSTANCE_LIMITS[DEFAULT_INSTANCE_CLASS][DEFAULT_INSTANCE_SIZE].vCPU,
       maxRAM:
         AWS_INSTANCE_LIMITS[DEFAULT_INSTANCE_CLASS][DEFAULT_INSTANCE_SIZE].RAM,
+      maxGPU: 1,
       instanceClass: DEFAULT_INSTANCE_CLASS,
       instanceSize: DEFAULT_INSTANCE_SIZE,
     };
@@ -106,7 +108,10 @@ const clusterNodesValidator = z
       return defaultResources;
     }
     const workloadKind = data.labels["porter.run/workload-kind"];
-    if (!workloadKind || workloadKind !== "application") {
+    if (
+      !workloadKind ||
+      (workloadKind !== "application" && workloadKind !== "custom")
+    ) {
       return defaultResources;
     }
     const instanceType = data.labels["beta.kubernetes.io/instance-type"];
@@ -115,6 +120,15 @@ const clusterNodesValidator = z
       return defaultResources;
     }
 
+    // update resource limits to the custom GPU limits
+    if (workloadKind === "custom" && GPU_INSTANCE_LIMIT[instanceType]) {
+      const { vCPU, RAM, GPU } = GPU_INSTANCE_LIMIT[instanceType];
+      return {
+        maxCPU: vCPU,
+        maxRAM: RAM,
+        maxGPU: GPU,
+      };
+    }
     // Azure instance types are all prefixed with "Standard_"
     if (instanceType.startsWith("Standard_")) {
       if (AZURE_INSTANCE_LIMITS[instanceType]) {
@@ -122,6 +136,7 @@ const clusterNodesValidator = z
         return {
           maxCPU: vCPU,
           maxRAM: RAM,
+          maxGPU: 1,
           azureType: instanceType,
         };
       } else {
@@ -150,10 +165,12 @@ const clusterNodesValidator = z
 
     const [instanceClass, instanceSize] = parsedType.data;
     if (AWS_INSTANCE_LIMITS[instanceClass]?.[instanceSize]) {
-      const { vCPU, RAM } = AWS_INSTANCE_LIMITS[instanceClass][instanceSize];
+      const { vCPU, RAM, GPU } =
+        AWS_INSTANCE_LIMITS[instanceClass][instanceSize];
       return {
         maxCPU: vCPU,
         maxRAM: RAM,
+        maxGPU: GPU || 1,
         instanceClass,
         instanceSize,
       };
@@ -176,6 +193,7 @@ export const useClusterResourceLimits = ({
   defaultCPU: number;
   defaultRAM: number;
   clusterContainsGPUNodes: boolean;
+  maxGPU: number;
   clusterIngressIp: string;
   loadBalancerType: ClientLoadBalancerType;
 } => {
@@ -183,6 +201,7 @@ export const useClusterResourceLimits = ({
   const LARGE_INSTANCE_UPPER_BOUND = 0.9;
   const DEFAULT_MULTIPLIER = 0.125;
   const [clusterContainsGPUNodes, setClusterContainsGPUNodes] = useState(false);
+  const [maxGPU, setMaxGPU] = useState(1);
   const [maxCPU, setMaxCPU] = useState(
     AWS_INSTANCE_LIMITS[DEFAULT_INSTANCE_CLASS][DEFAULT_INSTANCE_SIZE].vCPU *
       SMALL_INSTANCE_UPPER_BOUND
@@ -287,6 +306,9 @@ export const useClusterResourceLimits = ({
         const maxRAM = data.reduce((acc, curr) => {
           return Math.max(acc, curr.maxRAM);
         }, 0);
+        const maxGPU = data.reduce((acc, curr) => {
+          return Math.max(acc, curr.maxGPU);
+        }, 0);
         let maxMultiplier = SMALL_INSTANCE_UPPER_BOUND;
         // if the instance type has more than 4 GB ram, we use 90% of the ram/cpu
         // otherwise, we use 75%
@@ -301,6 +323,7 @@ export const useClusterResourceLimits = ({
           100;
         setMaxCPU(newMaxCPU);
         setMaxRAM(newMaxRAM);
+        setMaxGPU(maxGPU);
         setDefaultCPU(Number((newMaxCPU * DEFAULT_MULTIPLIER).toFixed(2)));
         setDefaultRAM(Number((newMaxRAM * DEFAULT_MULTIPLIER).toFixed(0)));
       }
@@ -345,9 +368,11 @@ export const useClusterResourceLimits = ({
           return c.kindValues.value.nodeGroups.some(
             (ng) =>
               (ng.nodeGroupType === NodeGroupType.CUSTOM &&
-                ng.instanceType.includes("g4dn")) ||
+                (ng.instanceType.includes("g4dn") ||
+                  ng.instanceType.includes("p4d"))) ||
               (ng.nodeGroupType === NodeGroupType.APPLICATION &&
-                ng.instanceType.includes("g4dn"))
+                (ng.instanceType.includes("g4dn") ||
+                  ng.instanceType.includes("p4d")))
           );
         })
         .with({ kindValues: { case: "gkeKind" } }, (c) => {
@@ -374,6 +399,8 @@ export const useClusterResourceLimits = ({
         })
         .otherwise(() => "UNSPECIFIED");
 
+      // console.log(gpu);
+      // setMaxGPU(gpu);
       setClusterContainsGPUNodes(containsCustomNodeGroup);
       setLoadBalancerType(loadBalancerType);
     }
@@ -385,6 +412,7 @@ export const useClusterResourceLimits = ({
     defaultCPU,
     defaultRAM,
     clusterContainsGPUNodes,
+    maxGPU,
     clusterIngressIp,
     loadBalancerType,
   };

+ 8 - 2
dashboard/src/main/home/app-dashboard/validate-apply/services-settings/ServiceContainer.tsx

@@ -35,6 +35,7 @@ type ServiceProps = {
   status?: ClientServiceStatus[];
   maxCPU: number;
   maxRAM: number;
+  maxGPU: number;
   clusterContainsGPUNodes: boolean;
   internalNetworkingDetails: {
     namespace: string;
@@ -53,6 +54,7 @@ const ServiceContainer: React.FC<ServiceProps> = ({
   status,
   maxCPU,
   maxRAM,
+  maxGPU,
   clusterContainsGPUNodes,
   internalNetworkingDetails,
   clusterIngressIp,
@@ -67,6 +69,7 @@ const ServiceContainer: React.FC<ServiceProps> = ({
           service={svc}
           maxCPU={maxCPU}
           maxRAM={maxRAM}
+          maxGPU={maxGPU}
           clusterContainsGPUNodes={clusterContainsGPUNodes}
           internalNetworkingDetails={internalNetworkingDetails}
           clusterIngressIp={clusterIngressIp}
@@ -79,6 +82,7 @@ const ServiceContainer: React.FC<ServiceProps> = ({
           service={svc}
           maxCPU={maxCPU}
           maxRAM={maxRAM}
+          maxGPU={maxGPU}
           clusterContainsGPUNodes={clusterContainsGPUNodes}
         />
       ))
@@ -88,6 +92,7 @@ const ServiceContainer: React.FC<ServiceProps> = ({
           service={svc}
           maxCPU={maxCPU}
           maxRAM={maxRAM}
+          maxGPU={maxGPU}
           clusterContainsGPUNodes={clusterContainsGPUNodes}
         />
       ))
@@ -97,6 +102,7 @@ const ServiceContainer: React.FC<ServiceProps> = ({
           service={svc}
           maxCPU={maxCPU}
           maxRAM={maxRAM}
+          maxGPU={maxGPU}
           clusterContainsGPUNodes={clusterContainsGPUNodes}
           isPredeploy
         />
@@ -212,7 +218,7 @@ const ServiceTitle = styled.div`
   align-items: center;
 `;
 
-const StyledSourceBox = styled(motion.div)<{
+const StyledSourceBox = styled(motion.div) <{
   showExpanded?: boolean;
   hasFooter?: boolean;
 }>`
@@ -275,7 +281,7 @@ const ServiceHeader = styled.div<{
     border-radius: 20px;
     margin-left: -10px;
     transform: ${(props: { showExpanded?: boolean }) =>
-      props.showExpanded ? "" : "rotate(-90deg)"};
+    props.showExpanded ? "" : "rotate(-90deg)"};
   }
 `;
 

+ 2 - 0
dashboard/src/main/home/app-dashboard/validate-apply/services-settings/ServiceList.tsx

@@ -78,6 +78,7 @@ const ServiceList: React.FC<ServiceListProps> = ({
     currentClusterResources: {
       maxCPU,
       maxRAM,
+      maxGPU,
       clusterContainsGPUNodes,
       clusterIngressIp,
       defaultCPU,
@@ -234,6 +235,7 @@ const ServiceList: React.FC<ServiceListProps> = ({
                 status={serviceVersionStatus?.[svc.name.value]}
                 maxCPU={maxCPU}
                 maxRAM={maxRAM}
+                maxGPU={maxGPU}
                 clusterContainsGPUNodes={clusterContainsGPUNodes}
                 internalNetworkingDetails={internalNetworkingDetails}
                 clusterIngressIp={clusterIngressIp}

+ 10 - 7
dashboard/src/main/home/app-dashboard/validate-apply/services-settings/tabs/JobTabs.tsx

@@ -24,6 +24,7 @@ type Props = {
   maxRAM: number;
   maxCPU: number;
   clusterContainsGPUNodes: boolean;
+  maxGPU: number;
   isPredeploy?: boolean;
 };
 
@@ -33,6 +34,7 @@ const JobTabs: React.FC<Props> = ({
   maxRAM,
   clusterContainsGPUNodes,
   maxCPU,
+  maxGPU,
   isPredeploy,
 }) => {
   const { control, register } = useFormContext<PorterAppFormData>();
@@ -42,14 +44,14 @@ const JobTabs: React.FC<Props> = ({
 
   const tabs = isPredeploy
     ? [
-        { label: "Main", value: "main" as const },
-        { label: "Resources", value: "resources" as const },
-      ]
+      { label: "Main", value: "main" as const },
+      { label: "Resources", value: "resources" as const },
+    ]
     : [
-        { label: "Main", value: "main" as const },
-        { label: "Resources", value: "resources" as const },
-        { label: "Advanced", value: "advanced" as const },
-      ];
+      { label: "Main", value: "main" as const },
+      { label: "Resources", value: "resources" as const },
+      { label: "Advanced", value: "advanced" as const },
+    ];
 
   return (
     <>
@@ -67,6 +69,7 @@ const JobTabs: React.FC<Props> = ({
             index={index}
             maxCPU={maxCPU}
             maxRAM={maxRAM}
+            maxGPU={maxGPU}
             clusterContainsGPUNodes={clusterContainsGPUNodes}
             service={service}
             isPredeploy={isPredeploy}

+ 38 - 0
dashboard/src/main/home/app-dashboard/validate-apply/services-settings/tabs/Resources.tsx

@@ -35,12 +35,14 @@ type ResourcesProps = {
   service: ClientService;
   isPredeploy?: boolean;
   clusterContainsGPUNodes: boolean;
+  maxGPU: number;
 };
 
 const Resources: React.FC<ResourcesProps> = ({
   index,
   maxCPU,
   maxRAM,
+  maxGPU,
   service,
   clusterContainsGPUNodes,
   isPredeploy = false,
@@ -73,6 +75,10 @@ const Resources: React.FC<ResourcesProps> = ({
     readOnly: false,
     value: 0,
   });
+  const gpu = watch(`app.services.${index}.gpu.enabled`, {
+    readOnly: false,
+    value: false,
+  });
 
   return (
     <>
@@ -259,6 +265,7 @@ const Resources: React.FC<ResourcesProps> = ({
                         <span>Enable GPU</span>
                       </>
                     </Text>
+
                     {!clusterContainsGPUNodes && (
                       <>
                         <Spacer inline x={1} />
@@ -295,6 +302,37 @@ const Resources: React.FC<ResourcesProps> = ({
                 </>
               )}
             />
+            {maxGPU > 1 && gpu.value && (
+              <>
+                <Spacer y={1} />
+                <Controller
+                  name={`app.services.${index}.gpu`}
+                  control={control}
+                  render={({ field: { value, onChange } }) => (
+                    <InputSlider
+                      label="GPU"
+                      unit=""
+                      min={0}
+                      max={maxGPU}
+                      value={value?.gpuCoresNvidia.value ?? "1"}
+                      disabled={value?.readOnly}
+                      setValue={(e) => {
+                        onChange({
+                          ...value,
+                          gpuCoresNvidia: {
+                            ...value.gpuCoresNvidia,
+                            value: e,
+                          },
+                        });
+                      }}
+                      disabledTooltip={
+                        "You may only edit this field in your porter.yaml."
+                      }
+                    />
+                  )}
+                />
+              </>
+            )}
             {currentCluster.status === "UPDATING" &&
               !clusterContainsGPUNodes && (
                 <CheckItemContainer>

+ 3 - 0
dashboard/src/main/home/app-dashboard/validate-apply/services-settings/tabs/WebTabs.tsx

@@ -20,6 +20,7 @@ type Props = {
   };
   maxRAM: number;
   maxCPU: number;
+  maxGPU: number;
   clusterContainsGPUNodes: boolean;
   internalNetworkingDetails: {
     namespace: string;
@@ -34,6 +35,7 @@ const WebTabs: React.FC<Props> = ({
   service,
   maxRAM,
   maxCPU,
+  maxGPU,
   clusterContainsGPUNodes,
   internalNetworkingDetails,
   clusterIngressIp,
@@ -71,6 +73,7 @@ const WebTabs: React.FC<Props> = ({
             index={index}
             maxCPU={maxCPU}
             maxRAM={maxRAM}
+            maxGPU={maxGPU}
             clusterContainsGPUNodes={clusterContainsGPUNodes}
             service={service}
           />

+ 3 - 0
dashboard/src/main/home/app-dashboard/validate-apply/services-settings/tabs/WorkerTabs.tsx

@@ -18,6 +18,7 @@ type Props = {
   };
   maxRAM: number;
   maxCPU: number;
+  maxGPU: number;
   clusterContainsGPUNodes: boolean;
 };
 
@@ -26,6 +27,7 @@ const WorkerTabs: React.FC<Props> = ({
   service,
   maxCPU,
   maxRAM,
+  maxGPU,
   clusterContainsGPUNodes,
 }) => {
   const [currentTab, setCurrentTab] = React.useState<
@@ -51,6 +53,7 @@ const WorkerTabs: React.FC<Props> = ({
             maxCPU={maxCPU}
             maxRAM={maxRAM}
             service={service}
+            maxGPU={maxGPU}
             clusterContainsGPUNodes={clusterContainsGPUNodes}
           />
         ))

+ 27 - 0
dashboard/src/main/home/app-dashboard/validate-apply/services-settings/tabs/utils.ts

@@ -3,6 +3,7 @@ export const MILI_TO_CORE = 1000;
 /**
  * Resource capacity recorded for one instance type in the limit tables below.
  * NOTE(review): RAM looks like it is expressed in GiB/GB — confirm the unit.
  */
 type InstanceDetails = {
   vCPU: number;
   RAM: number;
   // GPU figure for the instance type; optional, so table entries may omit it.
   // NOTE(review): presumably the number of GPUs per node — confirm.
+  GPU?: number;
 };
 
 type InstanceTypes = Record<string, Record<string, InstanceDetails>>;
@@ -133,3 +134,29 @@ export const AZURE_INSTANCE_LIMITS: AzureInstanceTypes = Object.freeze({
   Standard_DS2_v2: { vCPU: 2, RAM: 7 },
   Standard_D2ads_v5: { vCPU: 2, RAM: 8 },
 });
+
+/**
+ * Per-node resource limits for GPU-capable instance types.
+ *
+ * The table is nested: the first key is the instance class ("g4dn", "p4d",
+ * "n1") and the second key is the instance size ("xlarge", "24xlarge",
+ * "standard-4", ...). Each entry holds the vCPU count, the RAM amount and the
+ * GPU figure for that type (p4d.24xlarge carries 8).
+ *
+ * NOTE(review): RAM appears to be expressed in GiB/GB — confirm the unit.
+ * NOTE(review): because of the nesting, this must be indexed as
+ * GPU_INSTANCE_LIMIT[instanceClass][instanceSize]. Indexing it with a full
+ * instance type label such as "g4dn.xlarge" returns undefined, so callers
+ * that hold the raw label need to split it into class and size first.
+ */
+export const GPU_INSTANCE_LIMIT: InstanceTypes = Object.freeze({
+  g4dn: {
+    xlarge: { vCPU: 4, RAM: 16, GPU: 1 },
+    "2xlarge": { vCPU: 8, RAM: 32, GPU: 1 },
+  },
+  p4d: {
+    "24xlarge": { vCPU: 96, RAM: 1152, GPU: 8 },
+  },
+  n1: {
+    "standard-1": { vCPU: 1, RAM: 3.75, GPU: 1 },
+    "standard-2": { vCPU: 2, RAM: 7.5, GPU: 1 },
+    "standard-4": { vCPU: 4, RAM: 15, GPU: 1 },
+    "standard-8": { vCPU: 8, RAM: 30, GPU: 1 },
+    "standard-16": { vCPU: 16, RAM: 60, GPU: 1 },
+    "standard-32": { vCPU: 32, RAM: 120, GPU: 1 },
+    "high-mem-2": { vCPU: 2, RAM: 13, GPU: 1 },
+    "high-mem-4": { vCPU: 4, RAM: 26, GPU: 1 },
+    "high-mem-8": { vCPU: 8, RAM: 52, GPU: 1 },
+    "high-mem-16": { vCPU: 16, RAM: 104, GPU: 1 },
+    "high-mem-32": { vCPU: 32, RAM: 208, GPU: 1 },
+    // n1-highcpu-N machines have N vCPUs and 0.9 GB of memory per vCPU; the
+    // previous values (2/1.8, 4/3.6, 8/7.2) belonged to the -2/-4/-8 sizes.
+    "high-cpu-8": { vCPU: 8, RAM: 7.2, GPU: 1 },
+    "high-cpu-16": { vCPU: 16, RAM: 14.4, GPU: 1 },
+    "high-cpu-32": { vCPU: 32, RAM: 28.8, GPU: 1 },
+  },
+});

+ 3 - 0
dashboard/src/shared/ClusterResourcesContext.tsx

@@ -15,6 +15,7 @@ export type ClusterResources = {
   clusterContainsGPUNodes: boolean;
   clusterIngressIp: string;
   loadBalancerType: ClientLoadBalancerType;
+  maxGPU: number;
 };
 
 export const ClusterResourcesContext = createContext<{
@@ -45,6 +46,7 @@ const ClusterResourcesProvider = ({
     maxRAM,
     defaultCPU,
     defaultRAM,
+    maxGPU,
     clusterContainsGPUNodes,
     clusterIngressIp,
     loadBalancerType,
@@ -62,6 +64,7 @@ const ClusterResourcesProvider = ({
           maxRAM,
           defaultCPU,
           defaultRAM,
+          maxGPU,
           clusterContainsGPUNodes,
           clusterIngressIp,
           loadBalancerType,